Spaces:

mrchtr
/

semantic-demo

Runtime error

App Files Community

mrchtr committed on Jun 24, 2022

Commit

181e8c5

•

1 Parent(s): 01628bb

Fixing typos

Browse files

Files changed (3) hide show

app.py +25 -18
documentstore_german-election-idx_adapted.pkl +3 -0
retriever.py +12 -6

app.py CHANGED Viewed

@@ -11,7 +11,8 @@ def local_css(file_name):
 def render_retrieved_content(content, score):
-    print_score = ''
     if score is not None:
         score = round(score, 3)
         print_score = f'<b> Similarity Score: {score}</b>'
@@ -24,40 +25,46 @@ st.markdown('✨ Imagine you have a bunch of text documents and looking for one
          '💡 This demo compares different search approaches that can help you to find the right '
          'information.', unsafe_allow_html=True)
-option = st.selectbox(
-     'Choose a dataset',
-     ('CDU election program 2021', 'Partisan news 2019 (dutch)'))
-search = st.text_input('Enter your search query')
 if search:
     result = do_search(search)
     st.markdown('### 🔎  Term Frequency–Inverse Document Frequency (TF-IDF)')
     st.markdown('Is a statistical approach that calculates how relevant a word is to a document '
                 'in your collection. Only documents will be found that contain one of the words of '
-                'the given search query. You still have to remember on exact terms that are in the'
                 'searched phrase.')
-    st.markdown(render_retrieved_content(result[0][0].content, None),
                 unsafe_allow_html=True)
-    st.markdown('### 🧠  Semantic search')
-    st.markdown('An alternative approach is semantic search. Instead of using words of the '
-                'documents to calculate the score, we use a neural network that calculate the '
-                'similarity between the query and the documents of the collection. In other words, '
-                'the chance is high to find topic related documents without knowing the exact '
-                'terms.')
-    st.markdown(render_retrieved_content(result[1][0].content, result[1][0].score),
                 unsafe_allow_html=True)
-    st.markdown('### 🚀  Domain adapted semantic search')
-    st.markdown('If our document collection contains a lot of domain specific documents, '
                 'we can not use standard models. These models were trained on a large amount of '
-                'public available data, that covers probably not your domain specific words. To '
                 'improve the search results, we could fine-tune the network to calculate more '
                 'accurate similarities between queries and document regarding to your domain.')
-    st.markdown(render_retrieved_content(result[2][0].content, result[2][0].score),
                 unsafe_allow_html=True)

 def render_retrieved_content(content, score):
+    if score is not None and score == 0.0:
+        return f'<blockquote> No result </blockquote>'
     if score is not None:
         score = round(score, 3)
         print_score = f'<b> Similarity Score: {score}</b>'
          '💡 This demo compares different search approaches that can help you to find the right '
          'information.', unsafe_allow_html=True)
+with st.form('search-input'):
+    option = st.selectbox(
+        'Choose a dataset',
+        ('CDU election program 2021', 'Partisan news 2019 (dutch)'))
+    search = st.text_input('Enter your search query')
+    button = st.form_submit_button('Search')
 if search:
     result = do_search(search)
     st.markdown('### 🔎  Term Frequency–Inverse Document Frequency (TF-IDF)')
     st.markdown('Is a statistical approach that calculates how relevant a word is to a document '
                 'in your collection. Only documents will be found that contain one of the words of '
+                'the given search query. You still have to remember exact terms that are in the '
                 'searched phrase.')
+    st.markdown(render_retrieved_content(result[0].content, result[0].score),
                 unsafe_allow_html=True)
+    st.markdown('### 🧠  Semantic Search')
+    st.markdown('An alternative approach is semantic search. Instead of using words of the'
+                'documents to calculate the score, we use a neural network which calculates '
+                'sentence embeddings. Sentences and documents that are similar will be close to '
+                'each other in the embedding space. We use this behavior to find topic related '
+                'documents without knowing the exact terms. If you want learn more about this '
+                'topic check out one of our recent <a '
+                'href="https://blog.ml6.eu/decoding-sentence-encoders-37e63244ae00?source=collection_detail----1e091bbd5262-----2-----------------------">blogposts</a>.',
+                unsafe_allow_html=True)
+    st.markdown(render_retrieved_content(result[1].content, result[1].score),
                 unsafe_allow_html=True)
+    st.markdown('### 🚀  Domain Adapted Semantic Search')
+    st.markdown('If our document collection contains a lot of domain-specific documents, '
                 'we can not use standard models. These models were trained on a large amount of '
+                'publicly available data, which probably not covers your domain-specific words. To '
                 'improve the search results, we could fine-tune the network to calculate more '
                 'accurate similarities between queries and document regarding to your domain.')
+    st.markdown(render_retrieved_content(result[2].content, result[2].score),
                 unsafe_allow_html=True)

documentstore_german-election-idx_adapted.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:240da0dc8d623928b064900b3c1525e785aefb5cb07a471171d1af2aae0704c8
+size 4874683

retriever.py CHANGED Viewed

@@ -26,6 +26,9 @@ class ExportableInMemoryDocumentStore(InMemoryDocumentStore):
 document_store = ExportableInMemoryDocumentStore(similarity='cosine')
 document_store.load_data('documentstore_german-election-idx.pkl')
 retriever = TfidfRetriever(document_store=document_store)
 base_dense_retriever = EmbeddingRetriever(
         document_store=document_store,
@@ -34,15 +37,18 @@ base_dense_retriever = EmbeddingRetriever(
     )
 fine_tuned_retriever = EmbeddingRetriever(
-        document_store=document_store,
         embedding_model='./adapted-retriever',
         model_format='sentence_transformers'
 )
 def sparse_retrieval(query):
     """Sparse retrieval pipeline"""
     p_retrieval = DocumentSearchPipeline(retriever)
-    return p_retrieval.run(query=query)
 def dense_retrieval(query, retriever='base'):
     if retriever == 'base':
@@ -56,13 +62,13 @@ def dense_retrieval(query, retriever='base'):
 def do_search(query):
-    sparse_result = sparse_retrieval(query)['documents']
-    dense_base_result =dense_retrieval(query, retriever='base')['documents']
-    dense_adapted_result = dense_retrieval(query, retriever='adapted')['documents']
     return sparse_result, dense_base_result, dense_adapted_result
 if __name__ == '__main__':
-    query = 'Klimawandel stoppen?'
     result = do_search(query)
     pprint(result)

 document_store = ExportableInMemoryDocumentStore(similarity='cosine')
 document_store.load_data('documentstore_german-election-idx.pkl')
+document_store_adapted = ExportableInMemoryDocumentStore(similarity='cosine')
+document_store_adapted.load_data('documentstore_german-election-idx.pkl')
 retriever = TfidfRetriever(document_store=document_store)
 base_dense_retriever = EmbeddingRetriever(
         document_store=document_store,
     )
 fine_tuned_retriever = EmbeddingRetriever(
+        document_store=document_store_adapted,
         embedding_model='./adapted-retriever',
         model_format='sentence_transformers'
 )
 def sparse_retrieval(query):
     """Sparse retrieval pipeline"""
+    scores = retriever._calc_scores(query)
     p_retrieval = DocumentSearchPipeline(retriever)
+    documents = p_retrieval.run(query=query)
+    documents['documents'][0].score = list(scores[0].values())[0]
+    return documents
 def dense_retrieval(query, retriever='base'):
     if retriever == 'base':
 def do_search(query):
+    sparse_result = sparse_retrieval(query)['documents'][0]
+    dense_base_result =dense_retrieval(query, retriever='base')['documents'][0]
+    dense_adapted_result = dense_retrieval(query, retriever='adapted')['documents'][0]
     return sparse_result, dense_base_result, dense_adapted_result
 if __name__ == '__main__':
+    query = 'Frauen'
     result = do_search(query)
     pprint(result)