LeoGitGuy committed on
Commit
ee69da4
1 Parent(s): 62a1912

Upload 5 files

Files changed (5)
  1. app.py +53 -0
  2. eval_helper.py +80 -0
  3. requirements.txt +12 -0
  4. setup_database.py +22 -0
  5. setup_modules.py +48 -0
app.py ADDED
@@ -0,0 +1,53 @@
+ import gradio as gr
+
+ from setup_database import get_document_store, add_data
+ from setup_modules import create_retriever, create_readers_and_pipeline, text_reader_types, table_reader_types
+
+ document_index = "document"
+ document_store = get_document_store(document_index)
+ filenames = ["processed_website_tables", "processed_website_text", "processed_schedule_tables"]
+ document_store, data = add_data(filenames, document_store, document_index)
+ document_store, retriever = create_retriever(document_store)
+ text_reader_type = text_reader_types['deberta-large']
+ table_reader_type = table_reader_types['tapas']
+ pipeline = create_readers_and_pipeline(retriever, text_reader_type, table_reader_type, True, True)
+
+ title = "Welcome to BounWiki: The Question Answering Engine for Bogazici Students!"
+
+ head = '''
+ This engine uses information from the Bogazici University website to answer questions about different areas such as:
+
+ - Semester Dates (e.g. Registration Period, Add/Drop Period, ...)
+ - Campus buildings and their locations
+ - General Uni Information, like Buses from Uni and Taxi Numbers
+ - Schedule Information for all courses
+
+ It returns the top 3 answers together with their confidence scores.
+ '''
+
+
+ article = '''
+ # How does this work?
+
+ This App uses an "MPNet" sentence-transformer to encode information from the website into an embedding space.
+ When faced with a query, the semantically most similar document is retrieved.
+ A language model ("deberta-large" here) extracts the answer to the original question from this document and returns it to the interface.
+ '''
+
+
+ examples = [
+     ["When is the add/drop period?"],
+     ["What does it mean if instructor consent is required?"],
+     ["Where is the English preparatory unit located?"],
+ ]
+
+ label = gr.outputs.Label(num_top_classes=3)
+
+ def predict(input):
+     prediction = pipeline.run(
+         query=input, params={"top_k": 3}
+     )
+     return {a.answer: float(a.score) for a in prediction["answers"]}
+
+ interface = gr.Interface(fn=predict, inputs=gr.Textbox(lines=5, max_lines=6, label="Input Text"), outputs=label, title=title, description=head, article=article, examples=examples)
+ interface.launch(server_name="0.0.0.0", server_port=8080)
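Note: predict passes a single global top_k, so the retriever itself only fetches 3 candidate documents before the readers run. Haystack 1.x also accepts per-node parameters keyed by the node names defined in setup_modules.py; a minimal sketch (the 10/3 values are illustrative and not taken from this commit):

def predict_per_node(query):
    # Retrieve a larger pool of candidate documents, keep only the 3 best extracted answers
    prediction = pipeline.run(
        query=query,
        params={
            "EmbeddingRetriever": {"top_k": 10},  # node names match setup_modules.py
            "TextReader": {"top_k": 3},
            "TableReader": {"top_k": 3},
        },
    )
    return {a.answer: float(a.score) for a in prediction["answers"]}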
eval_helper.py ADDED
@@ -0,0 +1,80 @@
+ from haystack import Label, MultiLabel, Answer
+ import json
+ import re
+
+ def read_labels(labels, tables):
+     processed_labels = []
+     for table in tables:
+         if table.id not in labels:
+             continue
+         doc_labels = labels[table.id]
+         for label in doc_labels:
+             label = Label(
+                 query=label["question"],
+                 document=table,
+                 is_correct_answer=True,
+                 is_correct_document=True,
+                 answer=Answer(answer=label["answers"][0]["text"]),
+                 origin="gold-label",
+             )
+             processed_labels.append(MultiLabel(labels=[label]))
+     return processed_labels
+
+ def create_labels(labels_file, data, seperate_eval):
+     eval_labels = []
+     with open(labels_file) as labels_file:
+         labels = json.load(labels_file)
+     if seperate_eval:
+         use_labels = filter_labels(labels)
+     else:
+         use_labels = [labels]
+     for l in use_labels:
+         labels = []
+         for d in data:
+             labels += read_labels(l, d)
+         print(f"Number of Labels: {len(labels)}")
+         eval_labels.append(labels)
+     return eval_labels
+
+ def get_processed_squad_labels(squad_labels):
+     with open(f'./data/validation_data/{squad_labels}') as fp:
+         squad_labels = json.load(fp)
+     # Process the SQuAD file by aligning the right document IDs for the course schedules
+     processed_squad_labels = {}
+     for paragraph in squad_labels["data"]:
+         context = paragraph["paragraphs"][0]["context"]
+         if context[:43] == "Code\tName\tEcts\tInstructor\tDays\tHours\tRooms\n":
+             faculty_abb = re.search(r"[a-z]*", context[43:], re.IGNORECASE).group()
+             if faculty_abb in processed_squad_labels:
+                 processed_squad_labels[faculty_abb].extend(paragraph["paragraphs"][0]["qas"])
+             else:
+                 processed_squad_labels[faculty_abb] = paragraph["paragraphs"][0]["qas"]
+         else:
+             processed_squad_labels[str(paragraph["paragraphs"][0]["document_id"])] = paragraph["paragraphs"][0]["qas"]
+
+     with open("./data/validation_data/processed_qa.json", "w") as outfile:
+         json.dump(processed_squad_labels, outfile)
+     #return processed_squad_labels
+
+ def filter_labels(labels):
+     with open("./data/validation_data/questions_new.txt", "r") as fp:
+         user_questions = fp.read()
+
+     user_questions = user_questions.split("\n")
+     user_questions = [qu.strip() for qu in user_questions]
+     user_squad_labels = {}
+     synthetic_squad_labels = {}
+     for doc, questions in labels.items():
+         for q in questions:
+             if q["question"].strip() in user_questions:
+                 if doc in user_squad_labels:
+                     user_squad_labels[doc].append(q)
+                 else:
+                     user_squad_labels[doc] = [q]
+             else:
+                 if doc in synthetic_squad_labels:
+                     synthetic_squad_labels[doc].append(q)
+                 else:
+                     synthetic_squad_labels[doc] = [q]
+
+     return [user_squad_labels, synthetic_squad_labels]
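Note: eval_helper.py only builds the gold MultiLabel objects; the evaluation call itself is not part of this commit. A plausible sketch of how they would be consumed with a Haystack 1.x pipeline like the one from setup_modules.py (the labels file is the one written by get_processed_squad_labels; the metric keys are illustrative):

from eval_helper import create_labels

# Assumed wiring: data comes from setup_database.add_data, pipeline from create_readers_and_pipeline
eval_labels = create_labels("./data/validation_data/processed_qa.json", data, seperate_eval=False)

# Run the pipeline against the gold labels and compute per-node metrics
eval_result = pipeline.eval(labels=eval_labels[0], params={"top_k": 5})
metrics = eval_result.calculate_metrics()
print(metrics["EmbeddingRetriever"]["recall_single_hit"], metrics["TextReader"]["f1"])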
requirements.txt ADDED
@@ -0,0 +1,12 @@
+ #git+https://github.com/deepset-ai/haystack.git#egg=farm-haystack
+ farm-haystack
+ fastapi
+ requests
+ torch==1.13.*
+ transformers
+ uvicorn[standard]
+ grpcio
+ numpy
+ beautifulsoup4
+ gradio
+
setup_database.py ADDED
@@ -0,0 +1,22 @@
+ from haystack.document_stores import ElasticsearchDocumentStore
+ from haystack.document_stores import InMemoryDocumentStore
+ import os
+ import pickle
+
+ # def get_document_store(document_index):
+ #     host = os.environ.get("ELASTICSEARCH_HOST", "localhost")
+ #     document_store = ElasticsearchDocumentStore(host=host, username="", password="", index=document_index)
+ #     return document_store
+
+ def add_data(filenames, document_store, document_index):
+     data = []
+     for filename in filenames:
+         with open(f"./data/website_data/{filename}", "rb") as fp:
+             file = pickle.load(fp)
+         data.append(file)
+         document_store.write_documents(file, index=document_index)
+     return document_store, data
+
+ def get_document_store(document_index):
+     document_store = InMemoryDocumentStore(index=document_index)
+     return document_store
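Note: add_data passes the unpickled files straight to write_documents, so each file under ./data/website_data is presumably a list of Haystack Document objects (text and table documents), which is also what RouteDocuments in setup_modules.py later splits on via content_type. A sketch of that assumed format with made-up content, not taken from the real data:

from haystack import Document
from setup_database import get_document_store

example_docs = [
    # Illustrative document only; the real pickled content is not part of this commit
    Document(content="The add/drop period starts in the second week of the semester.", content_type="text"),
]
document_store = get_document_store("document")
document_store.write_documents(example_docs, index="document")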
setup_modules.py ADDED
@@ -0,0 +1,48 @@
+ from haystack.nodes.retriever import EmbeddingRetriever
+ from haystack.nodes import TableReader, FARMReader, RouteDocuments, JoinAnswers
+ from haystack import Pipeline
+
+ text_reader_types = {
+     "minilm": "deepset/minilm-uncased-squad2",
+     "distilroberta": "deepset/tinyroberta-squad2",
+     "electra-base": "deepset/electra-base-squad2",
+     "bert-base": "deepset/bert-base-cased-squad2",
+     "deberta-large": "deepset/deberta-v3-large-squad2",
+     "gpt3": "implement openai answer generator"
+ }
+ table_reader_types = {
+     "tapas": "deepset/tapas-large-nq-hn-reader",
+     "text": "implement changing tables to text"
+ }
+
+
+ def create_retriever(document_store):
+     retriever = EmbeddingRetriever(document_store=document_store, embedding_model="deepset/all-mpnet-base-v2-table")
+     document_store.update_embeddings(retriever=retriever)
+     return document_store, retriever
+
+ def create_readers_and_pipeline(retriever, text_reader_type="deepset/roberta-base-squad2", table_reader_type="deepset/tapas-large-nq-hn-reader", use_table=True, use_text=True):
+     both = (use_table and use_text)
+     if use_text or both:
+         print("Initializing Text reader..")
+         text_reader = FARMReader(text_reader_type)
+     if use_table or both:
+         print("Initializing table reader..")
+         table_reader = TableReader(table_reader_type)
+     if both:
+         route_documents = RouteDocuments()
+         join_answers = JoinAnswers()
+
+     text_table_qa_pipeline = Pipeline()
+     text_table_qa_pipeline.add_node(component=retriever, name="EmbeddingRetriever", inputs=["Query"])
+     if use_table and not use_text:
+         text_table_qa_pipeline.add_node(component=table_reader, name="TableReader", inputs=["EmbeddingRetriever"])
+     elif use_text and not use_table:
+         text_table_qa_pipeline.add_node(component=text_reader, name="TextReader", inputs=["EmbeddingRetriever"])
+     elif both:
+         text_table_qa_pipeline.add_node(component=route_documents, name="RouteDocuments", inputs=["EmbeddingRetriever"])
+         text_table_qa_pipeline.add_node(component=text_reader, name="TextReader", inputs=["RouteDocuments.output_1"])
+         text_table_qa_pipeline.add_node(component=table_reader, name="TableReader", inputs=["RouteDocuments.output_2"])
+         text_table_qa_pipeline.add_node(component=join_answers, name="JoinAnswers", inputs=["TextReader", "TableReader"])
+
+     return text_table_qa_pipeline
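Note: when only one of use_text/use_table is set, the router and joiner are skipped and the single reader hangs directly off the retriever. A text-only configuration, as a sketch (the model choice and the single data file are illustrative assumptions, not how app.py configures it):

from setup_database import get_document_store, add_data
from setup_modules import create_retriever, create_readers_and_pipeline, text_reader_types

document_store = get_document_store("document")
document_store, data = add_data(["processed_website_text"], document_store, "document")
document_store, retriever = create_retriever(document_store)

# Text-only pipeline: EmbeddingRetriever -> TextReader, no RouteDocuments/JoinAnswers nodes
pipeline = create_readers_and_pipeline(
    retriever,
    text_reader_type=text_reader_types["minilm"],  # smaller reader, faster on CPU
    use_table=False,
    use_text=True,
)
print(pipeline.run(query="When is the add/drop period?", params={"top_k": 3})["answers"][0].answer)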