# semantic-demo / app.py
# mrchtr's picture
# Add dutch partisan news dataset
# 8bd9363
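"""
Streamlit demo that compares document search approaches.

For a search query and a selected dataset, the page shows the retrieved
passage and its similarity score for TF-IDF, semantic search, and
domain-adapted semantic search.
"""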
import streamlit as st
from retriever import do_search, dutch_datset_name, german_datset_name
def local_css(file_name):
    """Inject the contents of a local CSS file into the Streamlit page.

    Args:
        file_name: Path of the CSS file to read.
    """
    # Explicit encoding so the stylesheet is decoded identically on every
    # platform instead of depending on the locale default.
    with open(file_name, encoding='utf-8') as f:
        st.markdown(f'<style>{f.read()}</style>', unsafe_allow_html=True)
def render_retrieved_content(content, score):
    """Format one retrieved passage as an HTML snippet.

    Args:
        content: Text of the retrieved passage. It is interpolated into the
            HTML as-is (no escaping is applied here).
        score: Similarity score of the passage, or ``None`` when no score is
            available.

    Returns:
        An HTML string. A score equal to ``0.0`` yields a "No result"
        blockquote. Otherwise the passage is wrapped in a blockquote,
        followed by the score rounded to three decimals; the score label is
        left out when ``score`` is ``None``.
    """
    if score is None:
        # Nothing to report: render the passage alone instead of relying on
        # a score label that was never built.
        return f'<blockquote> {content} </blockquote>'
    if score == 0.0:
        return '<blockquote> No result </blockquote>'
    print_score = f'<b> Similarity Score: {round(score, 3)}</b>'
    return f'<blockquote> {content} </blockquote> {print_score}'
# --- Page setup -------------------------------------------------------------
local_css('style.css')

st.header('🧐 Where my docs at?')
st.markdown('✨ Imagine you have a bunch of text documents and looking for one specific passage, '
            'but you can not remember on the exact words. Just about rough content. <br><br>'
            '💡 This demo compares different search approaches that can help you to find the right '
            'information.', unsafe_allow_html=True)

# --- Search form: dataset choice and free-text query --------------------------
with st.form('search-input'):
    option = st.selectbox(
        'Choose a dataset',
        (german_datset_name, dutch_datset_name))
    search = st.text_input('Enter your search query')
    button = st.form_submit_button('Search')

# --- Results: one section per retrieval approach -----------------------------
# Only run the search once a non-empty query has been entered.
if search:
    # The result is indexed below as 0 = TF-IDF, 1 = semantic search,
    # 2 = domain adapted semantic search; each entry exposes .content and .score.
    result = do_search(search, option)

    st.markdown('### 🔎 Term Frequency–Inverse Document Frequency (TF-IDF)')
    st.markdown('Is a statistical approach that calculates how relevant a word is to a document '
                'in your collection. Only documents will be found that contain one of the words of '
                'the given search query. You still have to remember exact terms that are in the '
                'searched phrase.')
    st.markdown(render_retrieved_content(result[0].content, result[0].score),
                unsafe_allow_html=True)

    st.markdown('### 🧠 Semantic Search')
    # The first fragment ends with a space so the implicit concatenation does
    # not glue "the" and "documents" together.
    st.markdown('An alternative approach is semantic search. Instead of using words of the '
                'documents to calculate the score, we use a neural network which calculates '
                'sentence embeddings. Sentences and documents that are similar will be close to '
                'each other in the embedding space. We use this behavior to find topic related '
                'documents without knowing the exact terms. If you want learn more about this '
                'topic check out one of our recent <a '
                'href="https://blog.ml6.eu/decoding-sentence-encoders-37e63244ae00?source=collection_detail----1e091bbd5262-----2-----------------------">blogposts</a>.',
                unsafe_allow_html=True)
    st.markdown(render_retrieved_content(result[1].content, result[1].score),
                unsafe_allow_html=True)

    st.markdown('### 🚀 Domain Adapted Semantic Search')
    st.markdown('If our document collection contains a lot of domain-specific documents, '
                'we can not use standard models. These models were trained on a large amount of '
                'publicly available data, which probably not covers your domain-specific words. To '
                'improve the search results, we could fine-tune the network to calculate more '
                'accurate similarities between queries and document regarding to your domain.')
    st.markdown(render_retrieved_content(result[2].content, result[2].score),
                unsafe_allow_html=True)