Stefano Fiorucci
committed on
Commit
•
418ba7e
1
Parent(s):
6e587e4
improved style and added some questions
Browse files- app.py +88 -60
- data/questions.txt +8 -1
app.py
CHANGED
@@ -2,7 +2,6 @@
|
|
2 |
import time
|
3 |
import streamlit as st
|
4 |
import logging
|
5 |
-
import pandas as pd
|
6 |
from json import JSONDecodeError
|
7 |
from markdown import markdown
|
8 |
import random
|
@@ -20,56 +19,71 @@ from urllib.parse import unquote
|
|
20 |
|
21 |
# FAISS index directory
|
22 |
INDEX_DIR = 'data/index'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
23 |
# pipe=None
|
24 |
|
25 |
# the following function is cached to make index and models load only at start
|
26 |
-
|
|
|
|
|
|
|
27 |
def start_haystack():
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
use_gpu=False,
|
43 |
-
confidence_threshold=
|
44 |
-
|
45 |
-
|
|
|
46 |
|
47 |
@st.cache()
|
48 |
def load_questions():
|
49 |
-
with open(
|
50 |
questions = [line.strip() for line in fin.readlines()
|
51 |
-
|
52 |
-
return questions
|
|
|
53 |
|
54 |
def set_state_if_absent(key, value):
|
55 |
if key not in st.session_state:
|
56 |
st.session_state[key] = value
|
57 |
|
58 |
-
pipe=start_haystack()
|
59 |
|
60 |
-
|
|
|
|
|
|
|
|
|
|
|
61 |
@st.cache(persist=True, allow_output_mutation=True)
|
62 |
-
def query(question: str, retriever_top_k:int=10, reader_top_k:int=5):
|
63 |
"""Run query and get answers"""
|
64 |
-
params = {"Retriever": {"top_k": retriever_top_k},
|
65 |
"Reader": {"top_k": reader_top_k}}
|
66 |
results = pipe.run(question, params=params)
|
67 |
return results
|
68 |
|
69 |
|
70 |
def main():
|
71 |
-
|
72 |
-
|
73 |
questions = load_questions()
|
74 |
|
75 |
# Persistent state
|
@@ -87,7 +101,7 @@ def main():
|
|
87 |
|
88 |
# sidebar style
|
89 |
st.markdown(
|
90 |
-
|
91 |
<style>
|
92 |
[data-testid="stSidebar"][aria-expanded="true"] > div:first-child{
|
93 |
width: 350px;
|
@@ -97,23 +111,25 @@ def main():
|
|
97 |
margin-left: -350px;
|
98 |
}
|
99 |
""",
|
100 |
-
|
101 |
)
|
102 |
# Title
|
103 |
st.write("# Who killed Laura Palmer?")
|
104 |
st.write("### The first Twin Peaks Question Answering system!")
|
105 |
-
|
106 |
st.markdown("""
|
107 |
-
Ask any question about
|
108 |
and see if the AI can find an answer...
|
109 |
|
110 |
*Note: do not use keywords, but full-fledged questions.*
|
111 |
""")
|
112 |
|
113 |
# Sidebar
|
114 |
-
st.sidebar.header("Who killed Laura Palmer?")
|
115 |
-
st.sidebar.image(
|
116 |
-
|
|
|
|
|
117 |
st.sidebar.markdown(f"""
|
118 |
<style>
|
119 |
a {{
|
@@ -139,7 +155,8 @@ and see if the AI can find an answer...
|
|
139 |
<div class="haystack-footer">
|
140 |
<p><a href="https://github.com/anakin87/who-killed-laura-palmer">GitHub</a> -
|
141 |
Built with <a href="https://github.com/deepset-ai/haystack/">Haystack</a><br/>
|
142 |
-
<small>Data crawled from <a href="https://twinpeaks.fandom.com/wiki/Twin_Peaks_Wiki">
|
|
|
143 |
</p>
|
144 |
<img src = 'https://static.wikia.nocookie.net/twinpeaks/images/e/ef/Laura_Palmer%2C_the_Queen_Of_Hearts.jpg'/>
|
145 |
<br/>
|
@@ -150,17 +167,19 @@ and see if the AI can find an answer...
|
|
150 |
st.sidebar.markdown("""
|
151 |
<p align="center">
|
152 |
<iframe style="border-radius:12px" src="https://open.spotify.com/embed/playlist/38rrtWgflrw7grB37aMlsO?utm_source=generator" width="85%" height="380" frameBorder="0" allowfullscreen="" allow="autoplay; clipboard-write; encrypted-media; fullscreen; picture-in-picture"></iframe>
|
153 |
-
</p>""", unsafe_allow_html=True)
|
154 |
|
155 |
# Search bar
|
156 |
question = st.text_input("",
|
157 |
-
|
158 |
-
|
159 |
-
|
160 |
-
|
161 |
col1, col2 = st.columns(2)
|
162 |
-
col1.markdown(
|
163 |
-
|
|
|
|
|
164 |
|
165 |
# Run button
|
166 |
run_pressed = col1.button("Run")
|
@@ -169,22 +188,24 @@ and see if the AI can find an answer...
|
|
169 |
if col2.button("Random question"):
|
170 |
reset_results()
|
171 |
question = random.choice(questions)
|
172 |
-
|
|
|
173 |
question = random.choice(questions)
|
174 |
st.session_state.question = question
|
175 |
-
# st.session_state.answer = new_row["Answer"].values[0]
|
176 |
st.session_state.random_question_requested = True
|
177 |
# Re-runs the script setting the random question as the textbox value
|
178 |
# Unfortunately necessary as the Random Question button is _below_ the textbox
|
179 |
-
raise st.script_runner.RerunException(
|
|
|
180 |
else:
|
181 |
st.session_state.random_question_requested = False
|
182 |
-
|
183 |
-
run_query = (run_pressed or question != st.session_state.question)
|
|
|
184 |
|
185 |
# Get results for query
|
186 |
if run_query and question:
|
187 |
-
time_start=time.time()
|
188 |
reset_results()
|
189 |
st.session_state.question = question
|
190 |
|
@@ -193,11 +214,13 @@ and see if the AI can find an answer...
|
|
193 |
|
194 |
):
|
195 |
try:
|
196 |
-
st.session_state.results = query(
|
197 |
-
|
|
|
198 |
print(f'elapsed time: {time_end - time_start}')
|
199 |
except JSONDecodeError as je:
|
200 |
-
st.error(
|
|
|
201 |
return
|
202 |
except Exception as e:
|
203 |
logging.exception(e)
|
@@ -207,28 +230,33 @@ and see if the AI can find an answer...
|
|
207 |
if st.session_state.results:
|
208 |
st.write("## Results:")
|
209 |
|
210 |
-
alert_irrelevance=True
|
211 |
-
if len(st.session_state.results['answers'])==0:
|
212 |
st.info("🤔 Haystack is unsure whether any of the documents contain an answer to your question. Try to reformulate it!")
|
213 |
|
214 |
for count, result in enumerate(st.session_state.results['answers']):
|
215 |
-
result=result.to_dict()
|
216 |
if result["answer"]:
|
217 |
-
if alert_irrelevance and result['score']<0.50:
|
218 |
alert_irrelevance = False
|
219 |
st.write("""
|
220 |
<h4 style='color: darkred'>Attention, the
|
221 |
following answers have low relevance:</h4>""",
|
222 |
-
|
223 |
|
224 |
answer, context = result["answer"], result["context"]
|
225 |
start_idx = context.find(answer)
|
226 |
end_idx = start_idx + len(answer)
|
227 |
# Hack due to this bug: https://github.com/streamlit/streamlit/issues/3190
|
228 |
-
st.write(markdown("- ..."+context[:start_idx] +
|
|
|
|
|
229 |
source = ""
|
230 |
-
name = unquote(result['meta']['name']).replace('_',' ')
|
231 |
url = result['meta']['url']
|
232 |
source = f"[{name}]({url})"
|
233 |
-
st.markdown(
|
|
|
|
|
|
|
234 |
main()
|
|
|
2 |
import time
|
3 |
import streamlit as st
|
4 |
import logging
|
|
|
5 |
from json import JSONDecodeError
|
6 |
from markdown import markdown
|
7 |
import random
|
|
|
19 |
|
20 |
# FAISS index directory
|
21 |
INDEX_DIR = 'data/index'
|
22 |
+
QUESTIONS_PATH = 'data/questions.txt'
|
23 |
+
RETRIEVER_MODEL = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
|
24 |
+
RETRIEVER_MODEL_FORMAT = "sentence_transformers"
|
25 |
+
READER_MODEL = "deepset/roberta-base-squad2"
|
26 |
+
READER_CONFIG_THRESHOLD = 0.15
|
27 |
+
RETRIEVER_TOP_K = 10
|
28 |
+
READER_TOP_K = 5
|
29 |
# pipe=None
|
30 |
|
31 |
# the following function is cached to make index and models load only at start
|
32 |
+
|
33 |
+
|
34 |
+
@st.cache(hash_funcs={"builtins.SwigPyObject": lambda _: None},
|
35 |
+
allow_output_mutation=True)
|
36 |
def start_haystack():
|
37 |
+
"""
|
38 |
+
load document store, retriever, reader and create pipeline
|
39 |
+
"""
|
40 |
+
shutil.copy(f'{INDEX_DIR}/faiss_document_store.db', '.')
|
41 |
+
document_store = FAISSDocumentStore(
|
42 |
+
faiss_index_path=f'{INDEX_DIR}/my_faiss_index.faiss',
|
43 |
+
faiss_config_path=f'{INDEX_DIR}/my_faiss_index.json')
|
44 |
+
print(f'Index size: {document_store.get_document_count()}')
|
45 |
+
retriever = EmbeddingRetriever(
|
46 |
+
document_store=document_store,
|
47 |
+
embedding_model=RETRIEVER_MODEL,
|
48 |
+
model_format=RETRIEVER_MODEL_FORMAT
|
49 |
+
)
|
50 |
+
reader = FARMReader(model_name_or_path=READER_MODEL,
|
51 |
use_gpu=False,
|
52 |
+
confidence_threshold=READER_CONFIG_THRESHOLD)
|
53 |
+
pipe = ExtractiveQAPipeline(reader, retriever)
|
54 |
+
return pipe
|
55 |
+
|
56 |
|
57 |
@st.cache()
|
58 |
def load_questions():
|
59 |
+
with open(QUESTIONS_PATH) as fin:
|
60 |
questions = [line.strip() for line in fin.readlines()
|
61 |
+
if not line.startswith('#')]
|
62 |
+
return questions
|
63 |
+
|
64 |
|
65 |
def set_state_if_absent(key, value):
|
66 |
if key not in st.session_state:
|
67 |
st.session_state[key] = value
|
68 |
|
|
|
69 |
|
70 |
+
pipe = start_haystack()
|
71 |
+
|
72 |
+
# the pipeline is not included as parameter of the following function,
|
73 |
+
# because it is difficult to cache
|
74 |
+
|
75 |
+
|
76 |
@st.cache(persist=True, allow_output_mutation=True)
|
77 |
+
def query(question: str, retriever_top_k: int = 10, reader_top_k: int = 5):
|
78 |
"""Run query and get answers"""
|
79 |
+
params = {"Retriever": {"top_k": retriever_top_k},
|
80 |
"Reader": {"top_k": reader_top_k}}
|
81 |
results = pipe.run(question, params=params)
|
82 |
return results
|
83 |
|
84 |
|
85 |
def main():
|
86 |
+
|
|
|
87 |
questions = load_questions()
|
88 |
|
89 |
# Persistent state
|
|
|
101 |
|
102 |
# sidebar style
|
103 |
st.markdown(
|
104 |
+
"""
|
105 |
<style>
|
106 |
[data-testid="stSidebar"][aria-expanded="true"] > div:first-child{
|
107 |
width: 350px;
|
|
|
111 |
margin-left: -350px;
|
112 |
}
|
113 |
""",
|
114 |
+
unsafe_allow_html=True,
|
115 |
)
|
116 |
# Title
|
117 |
st.write("# Who killed Laura Palmer?")
|
118 |
st.write("### The first Twin Peaks Question Answering system!")
|
119 |
+
|
120 |
st.markdown("""
|
121 |
+
Ask any question about [Twin Peaks] (https://twinpeaks.fandom.com/wiki/Twin_Peaks)
|
122 |
and see if the AI can find an answer...
|
123 |
|
124 |
*Note: do not use keywords, but full-fledged questions.*
|
125 |
""")
|
126 |
|
127 |
# Sidebar
|
128 |
+
st.sidebar.header("Who killed Laura Palmer?")
|
129 |
+
st.sidebar.image(
|
130 |
+
"https://upload.wikimedia.org/wikipedia/it/3/39/Twin-peaks-1990.jpg")
|
131 |
+
st.sidebar.markdown('<p align="center"><b>Twin Peaks Question Answering system</b></p>',
|
132 |
+
unsafe_allow_html=True)
|
133 |
st.sidebar.markdown(f"""
|
134 |
<style>
|
135 |
a {{
|
|
|
155 |
<div class="haystack-footer">
|
156 |
<p><a href="https://github.com/anakin87/who-killed-laura-palmer">GitHub</a> -
|
157 |
Built with <a href="https://github.com/deepset-ai/haystack/">Haystack</a><br/>
|
158 |
+
<small>Data crawled from <a href="https://twinpeaks.fandom.com/wiki/Twin_Peaks_Wiki">
|
159 |
+
Twin Peaks Wiki</a>.</small>
|
160 |
</p>
|
161 |
<img src = 'https://static.wikia.nocookie.net/twinpeaks/images/e/ef/Laura_Palmer%2C_the_Queen_Of_Hearts.jpg'/>
|
162 |
<br/>
|
|
|
167 |
st.sidebar.markdown("""
|
168 |
<p align="center">
|
169 |
<iframe style="border-radius:12px" src="https://open.spotify.com/embed/playlist/38rrtWgflrw7grB37aMlsO?utm_source=generator" width="85%" height="380" frameBorder="0" allowfullscreen="" allow="autoplay; clipboard-write; encrypted-media; fullscreen; picture-in-picture"></iframe>
|
170 |
+
</p>""", unsafe_allow_html=True)
|
171 |
|
172 |
# Search bar
|
173 |
question = st.text_input("",
|
174 |
+
value=st.session_state.question,
|
175 |
+
max_chars=100,
|
176 |
+
on_change=reset_results
|
177 |
+
)
|
178 |
col1, col2 = st.columns(2)
|
179 |
+
col1.markdown(
|
180 |
+
"<style>.stButton button {width:100%;}</style>", unsafe_allow_html=True)
|
181 |
+
col2.markdown(
|
182 |
+
"<style>.stButton button {width:100%;}</style>", unsafe_allow_html=True)
|
183 |
|
184 |
# Run button
|
185 |
run_pressed = col1.button("Run")
|
|
|
188 |
if col2.button("Random question"):
|
189 |
reset_results()
|
190 |
question = random.choice(questions)
|
191 |
+
# Avoid picking the same question twice (the change is not visible on the UI)
|
192 |
+
while question == st.session_state.question:
|
193 |
question = random.choice(questions)
|
194 |
st.session_state.question = question
|
|
|
195 |
st.session_state.random_question_requested = True
|
196 |
# Re-runs the script setting the random question as the textbox value
|
197 |
# Unfortunately necessary as the Random Question button is _below_ the textbox
|
198 |
+
raise st.script_runner.RerunException(
|
199 |
+
st.script_request_queue.RerunData(None))
|
200 |
else:
|
201 |
st.session_state.random_question_requested = False
|
202 |
+
|
203 |
+
run_query = (run_pressed or question != st.session_state.question) \
|
204 |
+
and not st.session_state.random_question_requested
|
205 |
|
206 |
# Get results for query
|
207 |
if run_query and question:
|
208 |
+
time_start = time.time()
|
209 |
reset_results()
|
210 |
st.session_state.question = question
|
211 |
|
|
|
214 |
|
215 |
):
|
216 |
try:
|
217 |
+
st.session_state.results = query(
|
218 |
+
question, RETRIEVER_TOP_K, READER_TOP_K)
|
219 |
+
time_end = time.time()
|
220 |
print(f'elapsed time: {time_end - time_start}')
|
221 |
except JSONDecodeError as je:
|
222 |
+
st.error(
|
223 |
+
"π An error occurred reading the results. Is the document store working?")
|
224 |
return
|
225 |
except Exception as e:
|
226 |
logging.exception(e)
|
|
|
230 |
if st.session_state.results:
|
231 |
st.write("## Results:")
|
232 |
|
233 |
+
alert_irrelevance = True
|
234 |
+
if len(st.session_state.results['answers']) == 0:
|
235 |
st.info("🤔 Haystack is unsure whether any of the documents contain an answer to your question. Try to reformulate it!")
|
236 |
|
237 |
for count, result in enumerate(st.session_state.results['answers']):
|
238 |
+
result = result.to_dict()
|
239 |
if result["answer"]:
|
240 |
+
if alert_irrelevance and result['score'] < 0.50:
|
241 |
alert_irrelevance = False
|
242 |
st.write("""
|
243 |
<h4 style='color: darkred'>Attention, the
|
244 |
following answers have low relevance:</h4>""",
|
245 |
+
unsafe_allow_html=True)
|
246 |
|
247 |
answer, context = result["answer"], result["context"]
|
248 |
start_idx = context.find(answer)
|
249 |
end_idx = start_idx + len(answer)
|
250 |
# Hack due to this bug: https://github.com/streamlit/streamlit/issues/3190
|
251 |
+
st.write(markdown("- ..."+context[:start_idx] +
|
252 |
+
str(annotation(answer, "ANSWER", "#3e1c21")) + context[end_idx:]+"..."),
|
253 |
+
unsafe_allow_html=True)
|
254 |
source = ""
|
255 |
+
name = unquote(result['meta']['name']).replace('_', ' ')
|
256 |
url = result['meta']['url']
|
257 |
source = f"[{name}]({url})"
|
258 |
+
st.markdown(
|
259 |
+
f"**Score:** {result['score']:.2f} - **Source:** {source}")
|
260 |
+
|
261 |
+
|
262 |
main()
|
data/questions.txt
CHANGED
@@ -16,4 +16,11 @@ Who is the log lady?
|
|
16 |
#Who is Bobby Briggs' father?
|
17 |
who is Susan Hurley
|
18 |
Who is Mike
|
19 |
-
Why did Windom Earle goes to Twin Peaks?
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
16 |
#Who is Bobby Briggs' father?
|
17 |
who is Susan Hurley
|
18 |
Who is Mike
|
19 |
+
Why did Windom Earle goes to Twin Peaks?
|
20 |
+
Who plays Laura Palmer?
|
21 |
+
Who was a Twin Peaks psychiatrist?
|
22 |
+
Who was Laura's secret boyfriend?
|
23 |
+
Who plays Bobby Briggs?
|
24 |
+
Who is the bad guy?
|
25 |
+
What does Laura die from?
|
26 |
+
Why did the movie flop in the United States?
|