Upload 5 files

- app.py +53 -0
- eval_helper.py +80 -0
- requirements.txt +12 -0
- setup_database.py +22 -0
- setup_modules.py +48 -0
app.py
ADDED
@@ -0,0 +1,53 @@
import gradio as gr

from setup_database import get_document_store, add_data
from setup_modules import create_retriever, create_readers_and_pipeline, text_reader_types, table_reader_types

document_index = "document"
document_store = get_document_store(document_index)
filenames = ["processed_website_tables", "processed_website_text", "processed_schedule_tables"]
document_store, data = add_data(filenames, document_store, document_index)
document_store, retriever = create_retriever(document_store)
text_reader_type = text_reader_types["deberta-large"]
table_reader_type = table_reader_types["tapas"]
pipeline = create_readers_and_pipeline(retriever, text_reader_type, table_reader_type, True, True)

title = "Welcome to BounWiki: The Question Answering Engine for Bogazici Students!"

head = '''
This engine uses information from the Bogazici University website to answer questions about areas such as:

- Semester dates (e.g. registration period, add/drop period, ...)
- Campus buildings and their locations
- General university information, such as buses from campus and taxi numbers
- Schedule information for all courses

It returns the top 3 answers together with their confidence scores.
'''

article = '''
# How does this work?

This app uses an "MPNet" sentence transformer to encode information from the website into an embedding space.
When faced with a query, the semantically most similar documents are retrieved.
A language model ("deberta-large" here) then extracts the answer to the original question from these documents and returns it to the interface.
'''

examples = [
    ["When is the add/drop period?"],
    ["What does it mean if instructor consent is required?"],
    ["Where is the English preparatory unit located?"],
]

# gr.outputs.Label was removed in current Gradio releases; gr.Label is the replacement
label = gr.Label(num_top_classes=3)

def predict(question):
    prediction = pipeline.run(
        query=question, params={"top_k": 3}
    )
    # Map each extracted answer to its confidence score for the Label component
    return {a.answer: float(a.score) for a in prediction["answers"]}

interface = gr.Interface(
    fn=predict,
    inputs=gr.Textbox(lines=5, max_lines=6, label="Input Text"),
    outputs=label,
    title=title,
    description=head,
    article=article,
    examples=examples,
)
interface.launch(server_name="0.0.0.0", server_port=8080)
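gr.Label expects a mapping from label text to confidence score, which is why predict flattens the Haystack answers into a plain dict. A small illustration of the shape handed to the component (the answers and scores below are hypothetical):

# Hypothetical example of the {answer: confidence} mapping predict() returns;
# gr.Label renders the keys ranked by their scores.
example_output = {
    "January 30 - February 3": 0.91,
    "the first week of the semester": 0.06,
    "see the academic calendar": 0.03,
}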
eval_helper.py
ADDED
@@ -0,0 +1,80 @@
from haystack import Label, MultiLabel, Answer
import json
import re

def read_labels(labels, tables):
    # Build gold MultiLabels for every document that has annotated questions
    processed_labels = []
    for table in tables:
        if table.id not in labels:
            continue
        doc_labels = labels[table.id]
        for label in doc_labels:
            label = Label(
                query=label["question"],
                document=table,
                is_correct_answer=True,
                is_correct_document=True,
                answer=Answer(answer=label["answers"][0]["text"]),
                origin="gold-label",
            )
            processed_labels.append(MultiLabel(labels=[label]))
    return processed_labels

def create_labels(labels_file, data, separate_eval):
    eval_labels = []
    with open(labels_file) as fp:
        labels = json.load(fp)
    if separate_eval:
        # Evaluate user-collected and synthetic questions separately
        use_labels = filter_labels(labels)
    else:
        use_labels = [labels]
    for label_set in use_labels:
        set_labels = []
        for d in data:
            set_labels += read_labels(label_set, d)
        print(f"Number of labels: {len(set_labels)}")
        eval_labels.append(set_labels)
    return eval_labels

def get_processed_squad_labels(squad_labels):
    with open(f"./data/validation_data/{squad_labels}") as fp:
        squad_labels = json.load(fp)
    # Process the SQuAD file by aligning the right document IDs for the course schedules
    processed_squad_labels = {}
    for paragraph in squad_labels["data"]:
        context = paragraph["paragraphs"][0]["context"]
        if context[:43] == "Code\tName\tEcts\tInstructor\tDays\tHours\tRooms\n":
            faculty_abb = re.search(r"[a-z]*", context[43:], re.IGNORECASE).group()
            if faculty_abb in processed_squad_labels:
                processed_squad_labels[faculty_abb].extend(paragraph["paragraphs"][0]["qas"])
            else:
                processed_squad_labels[faculty_abb] = paragraph["paragraphs"][0]["qas"]
        else:
            processed_squad_labels[str(paragraph["paragraphs"][0]["document_id"])] = paragraph["paragraphs"][0]["qas"]

    with open("./data/validation_data/processed_qa.json", "w") as outfile:
        json.dump(processed_squad_labels, outfile)
    # return processed_squad_labels

def filter_labels(labels):
    # Split labels into questions collected from users and synthetically generated ones
    with open("./data/validation_data/questions_new.txt", "r") as fp:
        user_questions = fp.read()

    user_questions = [qu.strip() for qu in user_questions.split("\n")]
    user_squad_labels = {}
    synthetic_squad_labels = {}
    for doc, questions in labels.items():
        for q in questions:
            if q["question"].strip() in user_questions:
                if doc in user_squad_labels:
                    user_squad_labels[doc].append(q)
                else:
                    user_squad_labels[doc] = [q]
            else:
                if doc in synthetic_squad_labels:
                    synthetic_squad_labels[doc].append(q)
                else:
                    synthetic_squad_labels[doc] = [q]

    return [user_squad_labels, synthetic_squad_labels]
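These helpers produce Haystack MultiLabel objects, which plug directly into Pipeline.eval. A minimal sketch of how they might be consumed, assuming the pickled data files are present as in app.py and a SQuAD-style annotation file at the hypothetical path "labels.json":

from setup_database import get_document_store, add_data
from setup_modules import create_retriever, create_readers_and_pipeline
from eval_helper import create_labels

store = get_document_store("document")
store, data = add_data(["processed_website_text"], store, "document")
store, retriever = create_retriever(store)
pipeline = create_readers_and_pipeline(retriever, use_table=False, use_text=True)

# "labels.json" is a hypothetical path; separate_eval=True splits user-written
# questions from synthetically generated ones via filter_labels
user_labels, synthetic_labels = create_labels("labels.json", data, separate_eval=True)

eval_result = pipeline.eval(labels=user_labels, params={"EmbeddingRetriever": {"top_k": 5}})
print(eval_result.calculate_metrics())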
requirements.txt
ADDED
@@ -0,0 +1,12 @@
#git+https://github.com/deepset-ai/haystack.git#egg=farm-haystack
farm-haystack
fastapi
requests
torch==1.13.*
transformers
uvicorn[standard]
grpcio
numpy
beautifulsoup4
gradio
setup_database.py
ADDED
@@ -0,0 +1,22 @@
from haystack.document_stores import InMemoryDocumentStore
import pickle

# Elasticsearch-backed alternative, kept for reference:
# import os
# from haystack.document_stores import ElasticsearchDocumentStore
#
# def get_document_store(document_index):
#     host = os.environ.get("ELASTICSEARCH_HOST", "localhost")
#     document_store = ElasticsearchDocumentStore(host=host, username="", password="", index=document_index)
#     return document_store

def add_data(filenames, document_store, document_index):
    # Load each pickled file of documents and write it into the document store
    data = []
    for filename in filenames:
        with open(f"./data/website_data/{filename}", "rb") as fp:
            file = pickle.load(fp)
        data.append(file)
        document_store.write_documents(file, index=document_index)
    return document_store, data

def get_document_store(document_index):
    document_store = InMemoryDocumentStore(index=document_index)
    return document_store
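add_data passes the unpickled files straight to DocumentStore.write_documents, so the pickles presumably hold lists of Haystack Document objects. A hypothetical sketch of producing a file in the expected format:

import pickle
from haystack import Document

# Hypothetical content; the real files come from the website preprocessing step
docs = [
    Document(content="The English preparatory unit is located on the North Campus.",
             content_type="text"),
]
with open("./data/website_data/processed_website_text", "wb") as fp:
    pickle.dump(docs, fp)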
setup_modules.py
ADDED
@@ -0,0 +1,48 @@
from haystack.nodes.retriever import EmbeddingRetriever
from haystack.nodes import TableReader, FARMReader, RouteDocuments, JoinAnswers
from haystack import Pipeline

text_reader_types = {
    "minilm": "deepset/minilm-uncased-squad2",
    "distilroberta": "deepset/tinyroberta-squad2",
    "electra-base": "deepset/electra-base-squad2",
    "bert-base": "deepset/bert-base-cased-squad2",
    "deberta-large": "deepset/deberta-v3-large-squad2",
    "gpt3": "implement openai answer generator",  # TODO: placeholder, not implemented
}
table_reader_types = {
    "tapas": "deepset/tapas-large-nq-hn-reader",
    "text": "implement changing tables to text",  # TODO: placeholder, not implemented
}


def create_retriever(document_store):
    retriever = EmbeddingRetriever(document_store=document_store, embedding_model="deepset/all-mpnet-base-v2-table")
    document_store.update_embeddings(retriever=retriever)
    return document_store, retriever

def create_readers_and_pipeline(retriever, text_reader_type="deepset/roberta-base-squad2", table_reader_type="deepset/tapas-large-nq-hn-reader", use_table=True, use_text=True):
    both = use_table and use_text
    if use_text:
        print("Initializing text reader...")
        text_reader = FARMReader(text_reader_type)
    if use_table:
        print("Initializing table reader...")
        table_reader = TableReader(table_reader_type)
    if both:
        route_documents = RouteDocuments()
        join_answers = JoinAnswers()

    text_table_qa_pipeline = Pipeline()
    text_table_qa_pipeline.add_node(component=retriever, name="EmbeddingRetriever", inputs=["Query"])
    if use_table and not use_text:
        text_table_qa_pipeline.add_node(component=table_reader, name="TableReader", inputs=["EmbeddingRetriever"])
    elif use_text and not use_table:
        text_table_qa_pipeline.add_node(component=text_reader, name="TextReader", inputs=["EmbeddingRetriever"])
    elif both:
        # Send text documents to the FARMReader and tables to the TableReader, then merge the answers
        text_table_qa_pipeline.add_node(component=route_documents, name="RouteDocuments", inputs=["EmbeddingRetriever"])
        text_table_qa_pipeline.add_node(component=text_reader, name="TextReader", inputs=["RouteDocuments.output_1"])
        text_table_qa_pipeline.add_node(component=table_reader, name="TableReader", inputs=["RouteDocuments.output_2"])
        text_table_qa_pipeline.add_node(component=join_answers, name="JoinAnswers", inputs=["TextReader", "TableReader"])

    return text_table_qa_pipeline
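With both readers enabled, RouteDocuments splits the retrieved documents by content_type (text documents to output_1, tables to output_2) and JoinAnswers merges the two answer lists. A minimal sketch of building and querying the combined pipeline, assuming the pickled data files are present:

from setup_database import get_document_store, add_data
from setup_modules import create_retriever, create_readers_and_pipeline, text_reader_types, table_reader_types

store = get_document_store("document")
store, _ = add_data(["processed_website_text", "processed_website_tables"], store, "document")
store, retriever = create_retriever(store)

pipeline = create_readers_and_pipeline(
    retriever,
    text_reader_types["minilm"],  # smaller reader for a quicker start
    table_reader_types["tapas"],
    use_table=True,
    use_text=True,
)
result = pipeline.run(query="Where is the English preparatory unit located?", params={"top_k": 3})
print([(a.answer, a.score) for a in result["answers"]])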