import streamlit as st
from datasets import load_dataset
from haystack import Pipeline
from haystack.components.readers import ExtractiveReader
from haystack.components.retrievers.in_memory import InMemoryBM25Retriever
from haystack.document_stores.in_memory import InMemoryDocumentStore

from utils import get_unique_docs
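
# Note: `get_unique_docs` (defined in utils.py, not shown here) is assumed to turn
# dataset rows into Haystack Document objects, skipping entries already recorded
# in the `unique_docs` set passed to it.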


# Load the dataset
@st.cache_data(show_spinner=False)
def load_documents():
    """
    Load the documents from the QASports dataset, keeping only unique documents.

    Returns:
    - documents: list of unique documents from the validation, train, and test splits.
    """
    unique_docs = set()
    dataset_name = "PedroCJardim/QASports"
    dataset_split = "basketball"
    st.caption(f'Fetching "{dataset_name}" dataset')
    # build the dataset
    dataset = load_dataset(dataset_name, dataset_split)
    docs_validation = get_unique_docs(dataset["validation"], unique_docs)
    docs_train = get_unique_docs(dataset["train"], unique_docs)
    docs_test = get_unique_docs(dataset["test"], unique_docs)
    documents = docs_validation + docs_train + docs_test
    return documents


@st.cache_resource(show_spinner=False)
def get_document_store(documents):
    """
    Index the documents in the document store.

    Args:
    - documents: list of documents to index.

    Returns:
    - document_store: in-memory document store containing the indexed documents.
    """
    # Create the in-memory database (st.cache_resource caches it so it is built only once)
    st.caption("Building the Document Store")
    document_store = InMemoryDocumentStore()
    document_store.write_documents(documents=documents)
    return document_store


@st.cache_resource(show_spinner=False)
def get_question_pipeline(_doc_store):
    """
    Create the pipeline with the retriever and reader components.

    Args:
    - _doc_store: instance of the document store (the leading underscore tells
      st.cache_resource not to hash this argument).

    Returns:
    - pipe: instance of the pipeline.
    """
    st.caption(f"Building the Question Answering pipeline")
    # Create the retriever and reader
    retriever = InMemoryBM25Retriever(document_store=_doc_store)
    reader = ExtractiveReader(model="deepset/roberta-base-squad2")
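    # warm_up() loads the reader model up front so the first query does not pay the loading cost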
    reader.warm_up()
    # Create the pipeline
    pipe = Pipeline()
    pipe.add_component(instance=retriever, name="retriever")
    pipe.add_component(instance=reader, name="reader")
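    # Feed the retriever's retrieved documents into the reader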
    pipe.connect("retriever.documents", "reader.documents")
    return pipe


def search(pipeline, question: str):
    """
    Search for the answer to a question in the documents.

    Args:
    - pipeline: instance of the pipeline.
    - question: string with the question.

    Returns:
    - answer: dictionary with the answer.
    """
    # Get the answers
    top_k = 3
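    # The retriever narrows the corpus to the 10 best BM25 matches;
    # the reader then extracts up to `top_k` answer spans from them.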
    answer = pipeline.run(
        data={
            "retriever": {"query": question, "top_k": 10},
            "reader": {"query": question, "top_k": top_k},
        }
    )
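    # The reader may return fewer answers than requested, so cap the slice accordingly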
    max_k = min(top_k, len(answer["reader"]["answers"]))
    return answer["reader"]["answers"][0:max_k]


# Streamlit interface
_, centering_column, _ = st.columns(3)
with centering_column:
    st.image("assets/qasports-logo.png", use_column_width=True)

# Loading status
with st.status(
    "Downloading dataset...", expanded=st.session_state.get("expanded", True)
) as status:
    documents = load_documents()
    status.update(label="Indexing documents...")
    doc_store = get_document_store(documents)
    status.update(label="Creating pipeline...")
    pipe = get_question_pipeline(doc_store)
    status.update(
        label="Download and indexing complete!", state="complete", expanded=False
    )
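    # Remember that loading finished so the status box starts collapsed on reruns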
    st.session_state["expanded"] = False

st.subheader("πŸ”Ž Basketball", divider="rainbow")
st.caption(
    """This website presents a collection of documents from the dataset named "QASports", the first large sports question answering dataset for open questions. QASports contains real data of players, teams and matches from the sports soccer, basketball and American football. It counts over 1.5 million questions and answers about 54k preprocessed, cleaned and organized documents from Wikipedia-like sources."""
)

if user_query := st.text_input(
    label="Ask a question about Basketball! πŸ€",
    placeholder="How many field goals did Kobe Bryant score?",
):
    # Get the answers
    with st.spinner("Waiting"):
        try:
            answer = search(pipe, user_query)
            for idx, ans in enumerate(answer):
                st.info(
                    f"""
                    Answer {idx+1}: "{ans.data}" | Score: {ans.score:0.4f}  
                    Document: "{ans.document.meta["title"]}"  
                    URL: {ans.document.meta["url"]}
                """
                )
                with st.expander("See details", expanded=False):
                    st.write(ans)
                st.divider()
        except Exception:
            st.error("We do not have an answer for your question")