Spaces:

leomaurodesenv
/

qasports-website

Sleeping

App Files Files Community

leomaurodesenv committed on Apr 17, 2024

Commit

8416f29

1 Parent(s): 4ce2e5d

feat(app): Add question answering for basketball, update the requirements

Browse files

Files changed (3) hide show

app.py +35 -11
requirements.txt +2 -0
utils.py +10 -3

app.py CHANGED Viewed

@@ -1,22 +1,46 @@
 import streamlit as st
 from datasets import load_dataset
-from haystack import Document
 from haystack.components.readers import ExtractiveReader
 # Load the dataset
-dataset = load_dataset("PedroCJardim/QASports", "basketball", split="validation")
-# Load the model
 reader = ExtractiveReader(model="laurafcamargos/distilbert-qasports-basket-small")
 reader.warm_up()
-# Running using the Reader
-docs = [
-    Document(content="Paris is the capital of France."),
-    Document(content="Berlin is the capital of Germany.")
-]
-query = "What is the capital of France?"
-answer = reader.run(query="What is the capital of France?", documents=docs, top_k=1)
-st.json(answer)

 import streamlit as st
 from datasets import load_dataset
+from haystack import Pipeline
 from haystack.components.readers import ExtractiveReader
+from haystack.components.retrievers.in_memory import InMemoryBM25Retriever
+from haystack.document_stores.in_memory import InMemoryDocumentStore
+from utils import get_unique_docs
 # Load the dataset
+unique_docs = set()
+dataset = load_dataset("PedroCJardim/QASports", "basketball")
+docs_validation = get_unique_docs(dataset["validation"], unique_docs)
+docs_train = get_unique_docs(dataset["train"], unique_docs)
+docs_test = get_unique_docs(dataset["test"], unique_docs)
+docs_all = docs_validation + docs_train + docs_test
+# Create the Question Answering pipeline
+# Create in memory database
+document_store = InMemoryDocumentStore()
+document_store.write_documents(documents=docs_all)
+# Create the retriever and reader
+retriever = InMemoryBM25Retriever(document_store=document_store)
 reader = ExtractiveReader(model="laurafcamargos/distilbert-qasports-basket-small")
 reader.warm_up()
+# Create the pipeline
+pipe = Pipeline()
+pipe.add_component(instance=retriever, name="retriever")
+pipe.add_component(instance=reader, name="reader")
+pipe.connect("retriever.documents", "reader.documents")
+# Streamlit interface
+st.markdown("""This website presents a collection of documents from the dataset named "QASports", the first large sports question answering dataset for open questions. QASports contains real data of players, teams and matches from the sports soccer, basketball and American football. It counts over 1.5 million questions and answers about 54k preprocessed, cleaned and organized documents from Wikipedia-like sources.""")
+st.subheader('QASports: Basketball', divider='rainbow')
+top_k = 3
+user_query = None
+user_query = st.text_input("Please, make a question about basketball:")
+if user_query:
+    answer = pipe.run(data={
+        "retriever": {"query": user_query, "top_k": 10},
+        "reader": {"query": user_query, "top_k": top_k},
+    })
+    # Display only the top k answers
+    st.json(answer["reader"]["answers"][0:top_k])

requirements.txt CHANGED Viewed

@@ -7,3 +7,5 @@ datasets==2.18.0
 haystack-ai==2.0.1
 accelerate==0.29.2
 sentence-transformers==2.7.0

 haystack-ai==2.0.1
 accelerate==0.29.2
 sentence-transformers==2.7.0
+# Extra
+mmh3==4.1.0

utils.py CHANGED Viewed

@@ -3,7 +3,7 @@ import mmh3
 from haystack import Document
-def get_unique_docs(dataset):
     '''Get unique documents from dataset
     Args:
@@ -12,11 +12,18 @@ def get_unique_docs(dataset):
     Returns:
     docs: list of haystack.Document
     '''
-    unique_docs = set()
     docs = list()
     for doc in dataset:
         if doc["context"] is not None and doc["context_id"] not in unique_docs:
             unique_docs.add(doc["context_id"])
-            document = Document(content=doc["context"], meta={'title': doc["context_title"], 'context_id': doc["context_id"]})
             docs.append(document)
     return docs

 from haystack import Document
+def get_unique_docs(dataset, unique_docs:set):
     '''Get unique documents from dataset
     Args:
     Returns:
     docs: list of haystack.Document
     '''
     docs = list()
     for doc in dataset:
         if doc["context"] is not None and doc["context_id"] not in unique_docs:
             unique_docs.add(doc["context_id"])
+            document = Document(
+                content=doc["context"],
+                meta={
+                    'title': doc["context_title"],
+                    'context_id': doc["context_id"],
+                    'url': doc["url"],
+                    'source': 'QASports', 'category': 'basketball'
+                }
+            )
             docs.append(document)
     return docs