Spaces:

ml6team
/

semantic-search-demo

Runtime error

File size: 3,432 Bytes

"""
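# Semantic search demo
Streamlit app that runs one query against a chosen dataset and shows the top hit from
TF-IDF, semantic search and domain-adapted semantic search side by side.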
"""
import streamlit as st
from retriever import do_search, dutch_datset_name, german_datset_name


def local_css(file_name):
    """Inject the stylesheet stored at ``file_name`` into the Streamlit page.

    The file's text is wrapped in a ``<style>`` element and emitted through
    ``st.markdown`` with raw HTML enabled.
    """
    with open(file_name) as css_file:
        css_rules = css_file.read()
    st.markdown(f'<style>{css_rules}</style>', unsafe_allow_html=True)


def render_retrieved_content(content, score):
    """Format one retrieved passage as an HTML snippet.

    Args:
        content: Text of the retrieved passage; inserted into the markup as-is.
        score: Similarity score of the passage, or ``None`` when no score is
            available.

    Returns:
        A ``<blockquote>`` holding the passage, followed by the score rounded
        to three decimals when one is given. A score equal to ``0.0`` is
        treated as "nothing found" and yields a ``No result`` blockquote.
    """
    if score is None:
        # No score to show: return the passage alone instead of referencing
        # a score label that was never built.
        return f'<blockquote> {content} </blockquote>'
    if score == 0.0:
        return '<blockquote> No result </blockquote>'
    print_score = f'<b> Similarity Score: {round(score, 3)}</b>'
    return f'<blockquote> {content} </blockquote> {print_score}'

# Page chrome: load the custom stylesheet, then show the title and the intro blurb.
local_css('style.css')
st.header('🧐  Where my docs at?')
intro_html = (
    '✨ Imagine you have a bunch of text documents and looking for one specific passage, '
    'but you can not remember on the exact words. Just about rough content. <br><br>'
    '💡 This demo compares different search approaches that can help you to find the right '
    'information.'
)
st.markdown(intro_html, unsafe_allow_html=True)




# Search form: dataset picker, free-text query and a submit button.
dataset_choices = (german_datset_name, dutch_datset_name)
with st.form('search-input'):
    option = st.selectbox('Choose a dataset', dataset_choices)
    search = st.text_input('Enter your search query')
    button = st.form_submit_button('Search')

if search:
    # The sections below pair the hits by position:
    # result[0] -> TF-IDF, result[1] -> semantic search,
    # result[2] -> domain-adapted semantic search.
    result = do_search(search, option)

    st.markdown('### 🔎  Term Frequency–Inverse Document Frequency (TF-IDF)')
    st.markdown('Is a statistical approach that calculates how relevant a word is to a document '
                'in your collection. Only documents will be found that contain one of the words of '
                'the given search query. You still have to remember exact terms that are in the '
                'searched phrase.')
    st.markdown(render_retrieved_content(result[0].content, result[0].score),
                unsafe_allow_html=True)

    st.markdown('### 🧠  Semantic Search')
    # Adjacent literals are concatenated verbatim, so each fragment must end
    # with a space; the first one used to render as "of thedocuments".
    st.markdown('An alternative approach is semantic search. Instead of using words of the '
                'documents to calculate the score, we use a neural network which calculates '
                'sentence embeddings. Sentences and documents that are similar will be close to '
                'each other in the embedding space. We use this behavior to find topic related '
                'documents without knowing the exact terms. If you want learn more about this '
                'topic check out one of our recent <a '
                'href="https://blog.ml6.eu/decoding-sentence-encoders-37e63244ae00?source=collection_detail----1e091bbd5262-----2-----------------------">blogposts</a>.',
                unsafe_allow_html=True)
    st.markdown(render_retrieved_content(result[1].content, result[1].score),
                unsafe_allow_html=True)

    st.markdown('### 🚀  Domain Adapted Semantic Search')
    st.markdown('If our document collection contains a lot of domain-specific documents, '
                'we can not use standard models. These models were trained on a large amount of '
                'publicly available data, which probably not covers your domain-specific words. To '
                'improve the search results, we could fine-tune the network to calculate more '
                'accurate similarities between queries and document regarding to your domain.')
    st.markdown(render_retrieved_content(result[2].content, result[2].score),
                unsafe_allow_html=True)