leomaurodesenv commited on
Commit
c200df3
β€’
1 Parent(s): 8416f29

feat(app): Run formatting, add caching and loading states

Browse files
Files changed (2) hide show
  1. app.py +105 -34
  2. utils.py +12 -10
app.py CHANGED
@@ -7,40 +7,111 @@ from haystack.document_stores.in_memory import InMemoryDocumentStore
7
 
8
  from utils import get_unique_docs
9
 
 
10
  # Load the dataset
11
- unique_docs = set()
12
- dataset = load_dataset("PedroCJardim/QASports", "basketball")
13
- docs_validation = get_unique_docs(dataset["validation"], unique_docs)
14
- docs_train = get_unique_docs(dataset["train"], unique_docs)
15
- docs_test = get_unique_docs(dataset["test"], unique_docs)
16
- docs_all = docs_validation + docs_train + docs_test
17
-
18
- # Create the Question Answering pipeline
19
- # Create in memory database
20
- document_store = InMemoryDocumentStore()
21
- document_store.write_documents(documents=docs_all)
22
- # Create the retriever and reader
23
- retriever = InMemoryBM25Retriever(document_store=document_store)
24
- reader = ExtractiveReader(model="laurafcamargos/distilbert-qasports-basket-small")
25
- reader.warm_up()
26
- # Create the pipeline
27
- pipe = Pipeline()
28
- pipe.add_component(instance=retriever, name="retriever")
29
- pipe.add_component(instance=reader, name="reader")
30
- pipe.connect("retriever.documents", "reader.documents")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
 
32
  # Streamlit interface
33
- st.markdown("""This website presents a collection of documents from the dataset named "QASports", the first large sports question answering dataset for open questions. QASports contains real data of players, teams and matches from the sports soccer, basketball and American football. It counts over 1.5 million questions and answers about 54k preprocessed, cleaned and organized documents from Wikipedia-like sources.""")
34
- st.subheader('QASports: Basketball', divider='rainbow')
35
-
36
- top_k = 3
37
- user_query = None
38
- user_query = st.text_input("Please, make a question about basketball:")
39
-
40
- if user_query:
41
- answer = pipe.run(data={
42
- "retriever": {"query": user_query, "top_k": 10},
43
- "reader": {"query": user_query, "top_k": top_k},
44
- })
45
- # Display only the top k answers
46
- st.json(answer["reader"]["answers"][0:top_k])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
 
8
  from utils import get_unique_docs
9
 
10
+
11
# Load the dataset
@st.cache_data(show_spinner=False)
def load_documents():
    """
    Load the documents from the dataset considering only unique documents.

    Returns:
    - documents: list of haystack.Document built from the unique contexts
      across the validation, train and test splits.
    """
    unique_docs = set()
    dataset_name = "PedroCJardim/QASports"
    dataset_split = "basketball"
    st.caption(f'Fetching "{dataset_name}" dataset')
    # Build the dataset; splits may repeat contexts, so deduplicate across
    # validation/train/test via the shared `unique_docs` id set.
    dataset = load_dataset(dataset_name, dataset_split)
    docs_validation = get_unique_docs(dataset["validation"], unique_docs)
    docs_train = get_unique_docs(dataset["train"], unique_docs)
    docs_test = get_unique_docs(dataset["test"], unique_docs)
    documents = docs_validation + docs_train + docs_test
    return documents
31
+
32
+
33
+ @st.cache_data(show_spinner=False)
34
+ def get_document_store(documents):
35
+ """
36
+ Index the files in the document store.
37
+
38
+ Args:
39
+ - files: list of dictionaries with the documents.
40
+ """
41
+ # Create in memory database
42
+ st.caption(f"Building the Document Store")
43
+ document_store = InMemoryDocumentStore()
44
+ document_store.write_documents(documents=documents)
45
+ return document_store
46
+
47
+
48
+ @st.cache_data(show_spinner=False)
49
+ def get_question_pipeline(_doc_store):
50
+ """
51
+ Create the pipeline with the retriever and reader components.
52
+
53
+ Args:
54
+ - doc_store: instance of the document store.
55
+
56
+ Returns:
57
+ - pipe: instance of the pipeline.
58
+ """
59
+ st.caption(f"Building the Question Answering pipeline")
60
+ # Create the retriever and reader
61
+ retriever = InMemoryBM25Retriever(document_store=_doc_store)
62
+ reader = ExtractiveReader(model="laurafcamargos/distilbert-qasports-basket-small")
63
+ reader.warm_up()
64
+ # Create the pipeline
65
+ pipe = Pipeline()
66
+ pipe.add_component(instance=retriever, name="retriever")
67
+ pipe.add_component(instance=reader, name="reader")
68
+ pipe.connect("retriever.documents", "reader.documents")
69
+ return pipe
70
+
71
+
72
+ # # Create the retriever and reader
73
+ # retriever = InMemoryBM25Retriever(document_store=document_store())
74
+ # reader = ExtractiveReader(model="laurafcamargos/distilbert-qasports-basket-small")
75
+ # reader.warm_up()
76
+ # # Create the pipeline
77
+ # pipe = Pipeline()
78
+ # pipe.add_component(instance=retriever, name="retriever")
79
+ # pipe.add_component(instance=reader, name="reader")
80
+ # pipe.connect("retriever.documents", "reader.documents")
81
 
82
# Streamlit interface
with st.status(
    "Downloading dataset...", expanded=st.session_state.get("expanded", True)
) as status:
    documents = load_documents()
    status.update(label="Indexing documents...")
    doc_store = get_document_store(documents)
    status.update(label="Creating pipeline...")
    pipe = get_question_pipeline(doc_store)
    status.update(
        label="Download and indexing complete!", state="complete", expanded=False
    )
    # Collapse the status box on subsequent reruns
    st.session_state["expanded"] = False

st.subheader("🔎 QASports: Basketball", divider="rainbow")
st.caption(
    """This website presents a collection of documents from the dataset named "QASports", the first large sports question answering dataset for open questions. QASports contains real data of players, teams and matches from the sports soccer, basketball and American football. It counts over 1.5 million questions and answers about 54k preprocessed, cleaned and organized documents from Wikipedia-like sources."""
)

if user_query := st.text_input(
    label="What do you want to know about Basketball?",
    placeholder="How many field goals did Kobe Bryant score?",
):
    try:
        top_k = 3
        answer = pipe.run(
            data={
                "retriever": {"query": user_query, "top_k": 10},
                "reader": {"query": user_query, "top_k": top_k},
            }
        )
        # Display only the top k answers. Slicing clamps to the list length
        # automatically, so no min(top_k, len(...)) bound is needed.
        st.json(answer["reader"]["answers"][:top_k])
    except Exception:
        # Broad catch keeps the app alive on any pipeline failure and shows
        # a friendly message instead of a traceback (the exception value was
        # unused, so it is no longer bound).
        st.error("Error: We do not have an answer for your question.")
utils.py CHANGED
@@ -1,17 +1,18 @@
1
- '''This module contains utility functions for the project'''
 
2
  import mmh3
3
  from haystack import Document
4
 
5
 
6
- def get_unique_docs(dataset, unique_docs:set):
7
- '''Get unique documents from dataset
8
-
9
  Args:
10
  dataset: list of dictionaries
11
 
12
  Returns:
13
  docs: list of haystack.Document
14
- '''
15
  docs = list()
16
  for doc in dataset:
17
  if doc["context"] is not None and doc["context_id"] not in unique_docs:
@@ -19,11 +20,12 @@ def get_unique_docs(dataset, unique_docs:set):
19
  document = Document(
20
  content=doc["context"],
21
  meta={
22
- 'title': doc["context_title"],
23
- 'context_id': doc["context_id"],
24
- 'url': doc["url"],
25
- 'source': 'QASports', 'category': 'basketball'
26
- }
 
27
  )
28
  docs.append(document)
29
  return docs
 
1
+ """This module contains utility functions for the project"""
2
+
3
  import mmh3
4
  from haystack import Document
5
 
6
 
7
+ def get_unique_docs(dataset, unique_docs: set):
8
+ """Get unique documents from dataset
9
+
10
  Args:
11
  dataset: list of dictionaries
12
 
13
  Returns:
14
  docs: list of haystack.Document
15
+ """
16
  docs = list()
17
  for doc in dataset:
18
  if doc["context"] is not None and doc["context_id"] not in unique_docs:
 
20
  document = Document(
21
  content=doc["context"],
22
  meta={
23
+ "title": doc["context_title"],
24
+ "context_id": doc["context_id"],
25
+ "url": doc["url"],
26
+ "source": "QASports",
27
+ "category": "basketball",
28
+ },
29
  )
30
  docs.append(document)
31
  return docs