Spaces:
Runtime error
Runtime error
Update styles
Browse files- app.py +46 -14
- retriever.py +3 -3
- style.css +18 -0
app.py
CHANGED
@@ -3,30 +3,62 @@
|
|
3 |
Here's our first attempt at using data to create a table:
|
4 |
"""
|
5 |
import streamlit as st
|
6 |
-
import pandas as pd
|
7 |
-
from load_css import local_css
|
8 |
from retriever import do_search
|
9 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
10 |
local_css('style.css')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
11 |
|
12 |
-
st.header('Semantic search demo')
|
13 |
-
search = st.text_input('')
|
14 |
|
|
|
15 |
if search:
|
16 |
result = do_search(search)
|
17 |
-
col1, col2, col3 = st.columns(3)
|
18 |
|
19 |
-
|
20 |
-
|
21 |
-
|
|
|
|
|
|
|
|
|
22 |
|
23 |
-
|
24 |
-
|
25 |
-
|
|
|
|
|
|
|
|
|
|
|
26 |
|
27 |
-
|
28 |
-
|
29 |
-
|
|
|
|
|
|
|
|
|
|
|
30 |
|
31 |
|
32 |
|
|
|
3 |
Here's our first attempt at using data to create a table:
|
4 |
"""
|
5 |
import streamlit as st
|
|
|
|
|
6 |
from retriever import do_search
|
7 |
|
8 |
+
def local_css(file_name):
    """Inject the stylesheet stored at *file_name* into the Streamlit page.

    The file's text is wrapped in a ``<style>`` element and emitted through
    ``st.markdown`` with ``unsafe_allow_html=True`` so that the raw HTML tag
    is passed through instead of being escaped.

    Args:
        file_name: Path of the CSS file to read.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # locale of the machine the app happens to run on.
    with open(file_name, encoding='utf-8') as f:
        st.markdown(f'<style>{f.read()}</style>', unsafe_allow_html=True)
|
11 |
+
|
12 |
+
|
13 |
+
def render_retrieved_content(content, score):
    """Build the HTML snippet used to display one retrieved passage.

    Args:
        content: Text of the retrieved document. It is HTML-escaped before
            being embedded, because callers render the returned string with
            ``unsafe_allow_html=True``.
        score: Similarity score of the hit, or ``None`` when no score should
            be shown.

    Returns:
        A string with the passage inside a ``<blockquote>`` followed, when a
        score was given, by the score rounded to three decimals in bold.
    """
    import html  # local import keeps this block self-contained

    print_score = ''
    if score is not None:
        score = round(score, 3)
        print_score = f'<b> Similarity Score: {score}</b>'

    # Document text is data, not markup: neutralise <, > and & so a passage
    # cannot break the surrounding blockquote or inject tags into the page.
    safe_content = html.escape(str(content), quote=False)
    return f'<blockquote>{safe_content} </blockquote> {print_score}'
|
19 |
+
|
20 |
# Page set-up: load the custom stylesheet, then show the title and intro text.
local_css('style.css')
st.header('🧐 Where my docs at?')
st.markdown('✨ Imagine you have a bunch of text documents and looking for one specific passage, '
            'but you can not remember on the exact words. Just about rough content. <br><br>'
            '💡 This demo compares different search approaches that can help you to find the right '
            'information.', unsafe_allow_html=True)

# NOTE(review): the selected dataset is stored in `option` but never passed
# on to the search below - confirm whether do_search should receive it.
option = st.selectbox(
    'Choose a dataset',
    ('CDU election program 2021', 'Partisan news 2019 (dutch)'))


search = st.text_input('Enter your search query')
if search:
    # `result` is indexed as three result lists below (0, 1, 2); the first
    # document of each list is displayed.
    result = do_search(search)

    st.markdown('### 🔎 Term Frequency–Inverse Document Frequency (TF-IDF)')
    # The trailing space after "in the" is required: adjacent string literals
    # are concatenated verbatim, and without it the text read "thesearched".
    st.markdown('Is a statistical approach that calculates how relevant a word is to a document '
                'in your collection. Only documents will be found that contain one of the words of '
                'the given search query. You still have to remember on exact terms that are in the '
                'searched phrase.')
    # No score is passed for this result, so none is rendered.
    st.markdown(render_retrieved_content(result[0][0].content, None),
                unsafe_allow_html=True)

    st.markdown('### 🧠 Semantic search')
    st.markdown('An alternative approach is semantic search. Instead of using words of the '
                'documents to calculate the score, we use a neural network that calculate the '
                'similarity between the query and the documents of the collection. In other words, '
                'the chance is high to find topic related documents without knowing the exact '
                'terms.')
    st.markdown(render_retrieved_content(result[1][0].content, result[1][0].score),
                unsafe_allow_html=True)

    st.markdown('### 🚀 Domain adapted semantic search')
    st.markdown('If our document collection contains a lot of domain specific documents, '
                'we can not use standard models. These models were trained on a large amount of '
                'public available data, that covers probably not your domain specific words. To '
                'improve the search results, we could fine-tune the network to calculate more '
                'accurate similarities between queries and document regarding to your domain.')
    st.markdown(render_retrieved_content(result[2][0].content, result[2][0].score),
                unsafe_allow_html=True)
|
62 |
|
63 |
|
64 |
|
retriever.py
CHANGED
@@ -56,9 +56,9 @@ def dense_retrieval(query, retriever='base'):
|
|
56 |
|
57 |
|
58 |
def do_search(query):
|
59 |
-
sparse_result = sparse_retrieval(query)['documents']
|
60 |
-
dense_base_result =
|
61 |
-
dense_adapted_result = dense_retrieval(query, retriever='adapted')['documents']
|
62 |
return sparse_result, dense_base_result, dense_adapted_result
|
63 |
|
64 |
if __name__ == '__main__':
|
|
|
56 |
|
57 |
|
58 |
def do_search(query):
    """Run *query* through the sparse retriever and both dense retrievers.

    Returns:
        A 3-tuple ``(sparse, dense_base, dense_adapted)`` holding the
        ``'documents'`` entry of each retrieval result, in that order.
    """
    tfidf_docs = sparse_retrieval(query)['documents']
    # The generator is consumed left to right, so 'base' still runs
    # before 'adapted'.
    base_docs, adapted_docs = (
        dense_retrieval(query, retriever=variant)['documents']
        for variant in ('base', 'adapted')
    )
    return tfidf_docs, base_docs, adapted_docs
|
63 |
|
64 |
if __name__ == '__main__':
|
style.css
ADDED
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
blockquote {
|
2 |
+
background: #f9f9f9;
|
3 |
+
border-left: 10px solid #ccc;
|
4 |
+
margin: 1.5em 10px;
|
5 |
+
padding: 0.5em 10px;
|
6 |
+
quotes: "\201C""\201D""\2018""\2019";
|
7 |
+
}
|
8 |
+
blockquote:before {
|
9 |
+
color: #ccc;
|
10 |
+
content: '';
|
11 |
+
font-size: 4em;
|
12 |
+
line-height: 0.1em;
|
13 |
+
margin-right: 0.25em;
|
14 |
+
vertical-align: -0.4em;
|
15 |
+
}
|
16 |
+
blockquote p {
|
17 |
+
display: inline;
|
18 |
+
}
|