Spaces:

ugaray96
/

neural-search

Runtime error

App Files Community

ugaray96 committed on Sep 23, 2022

Commit

c9524e4

•

2 Parent(s): 17fa846 6a6afbf

Merge pull request #9 from ugm2/fix/indexing

Browse files

Files changed (7) hide show

.streamlit/config.toml +1 -1
app.py +1 -1
core/pipelines.py +3 -4
core/search_index.py +7 -1
interface/components.py +21 -24
interface/config.py +5 -1
interface/pages.py +6 -3

.streamlit/config.toml CHANGED Viewed

@@ -1,5 +1,5 @@
 [theme]
-primaryColor="#ffbf00"
 backgroundColor="#0e1117"
 secondaryBackgroundColor="#282929"
 textColor = "#ffffff"

 [theme]
+primaryColor="#e5ab00"
 backgroundColor="#0e1117"
 secondaryBackgroundColor="#282929"
 textColor = "#ffffff"

app.py CHANGED Viewed

@@ -29,7 +29,7 @@ def run_demo():
     with navigation:
         selected_page = option_menu(
-            menu_title="Navigation",
             options=list(pages.keys()),
             icons=[f[1] for f in pages.values()],
             menu_icon="cast",

     with navigation:
         selected_page = option_menu(
+            menu_title=None,
             options=list(pages.keys()),
             icons=[f[1] for f in pages.values()],
             menu_icon="cast",

core/pipelines.py CHANGED Viewed

@@ -19,6 +19,8 @@ def keyword_search(index="documents", split_word_length=100):
       - Documents that have more lexical overlap with the query are more likely to be relevant
       - Words that occur in fewer documents are more significant than words that occur in many documents
     """
     document_store = InMemoryDocumentStore(index=index)
     keyword_retriever = TfidfRetriever(document_store=(document_store))
@@ -39,10 +41,7 @@ def keyword_search(index="documents", split_word_length=100):
     index_pipeline = Pipeline()
     index_pipeline.add_node(processor, name="Preprocessor", inputs=["File"])
     index_pipeline.add_node(
-        keyword_retriever, name="TfidfRetriever", inputs=["Preprocessor"]
-    )
-    index_pipeline.add_node(
-        document_store, name="DocumentStore", inputs=["TfidfRetriever"]
     )
     return search_pipeline, index_pipeline

       - Documents that have more lexical overlap with the query are more likely to be relevant
       - Words that occur in fewer documents are more significant than words that occur in many documents
+    :warning: **(HAYSTACK BUG) Keyword Search doesn't work if you reindex:** Please refresh page in order to reindex
     """
     document_store = InMemoryDocumentStore(index=index)
     keyword_retriever = TfidfRetriever(document_store=(document_store))
     index_pipeline = Pipeline()
     index_pipeline.add_node(processor, name="Preprocessor", inputs=["File"])
     index_pipeline.add_node(
+        document_store, name="DocumentStore", inputs=["Preprocessor"]
     )
     return search_pipeline, index_pipeline

core/search_index.py CHANGED Viewed

@@ -1,4 +1,5 @@
 from haystack.schema import Document
 import uuid
@@ -17,8 +18,12 @@ def format_docs(documents):
     return db_docs, [doc.meta["id"] for doc in db_docs]
-def index(documents, pipeline):
     documents, doc_ids = format_docs(documents)
     pipeline.run(documents=documents)
     return doc_ids
@@ -38,6 +43,7 @@ def search(queries, pipeline):
                     "score": res.score,
                     "id": res.meta["id"],
                     "fragment_id": res.id,
                 }
             )
         if not score_is_empty:

 from haystack.schema import Document
+from haystack.document_stores import BaseDocumentStore
 import uuid
     return db_docs, [doc.meta["id"] for doc in db_docs]
+def index(documents, pipeline, clear_index=True):
     documents, doc_ids = format_docs(documents)
+    if clear_index:
+        document_stores = pipeline.get_nodes_by_class(class_type=BaseDocumentStore)
+        for docstore in document_stores:
+            docstore.delete_index(docstore.index)
     pipeline.run(documents=documents)
     return doc_ids
                     "score": res.score,
                     "id": res.meta["id"],
                     "fragment_id": res.id,
+                    "meta": res.meta,
                 }
             )
         if not score_is_empty:

interface/components.py CHANGED Viewed

@@ -42,11 +42,15 @@ def component_select_pipeline(container):
                 "index_pipeline": index_pipeline,
                 "doc": pipeline_funcs[index_pipe].__doc__,
             }
 def component_show_pipeline(pipeline, pipeline_name):
     """Draw the pipeline"""
-    with st.expander("Show pipeline"):
         if pipeline["doc"] is not None:
             st.markdown(pipeline["doc"])
         fig = get_pipeline_graph(pipeline[pipeline_name])
@@ -59,41 +63,39 @@ def component_show_search_result(container, results):
             st.markdown(f"### Match {idx+1}")
             st.markdown(f"**Text**: {document['text']}")
             st.markdown(f"**Document**: {document['id']}")
             if document["score"] is not None:
                 st.markdown(f"**Score**: {document['score']:.3f}")
             st.markdown("---")
-def component_text_input(container):
     """Draw the Text Input widget"""
     with container:
         texts = []
-        doc_id = 1
         with st.expander("Enter documents"):
             while True:
                 text = st.text_input(f"Document {doc_id}", key=doc_id)
                 if text != "":
-                    texts.append({"text": text})
                     doc_id += 1
                     st.markdown("---")
                 else:
                     break
-        corpus = [
-            {"text": doc["text"], "id": doc_id} for doc_id, doc in enumerate(texts)
-        ]
-        return corpus
-def component_article_url(container):
     """Draw the Article URL widget"""
     with container:
         urls = []
-        doc_id = 1
         with st.expander("Enter URLs"):
             while True:
                 url = st.text_input(f"URL {doc_id}", key=doc_id)
                 if url != "":
-                    urls.append({"text": extract_text_from_url(url)})
                     doc_id += 1
                     st.markdown("---")
                 else:
@@ -101,19 +103,16 @@ def component_article_url(container):
         for idx, doc in enumerate(urls):
             with st.expander(f"Preview URL {idx}"):
-                st.write(doc)
-        corpus = [
-            {"text": doc["text"], "id": doc_id} for doc_id, doc in enumerate(urls)
-        ]
-        return corpus
-def component_file_input(container):
     """Draw the extract text from file widget"""
     with container:
         files = []
-        doc_id = 1
         with st.expander("Enter Files"):
             while True:
                 file = st.file_uploader(
@@ -122,7 +121,7 @@ def component_file_input(container):
                 if file != None:
                     extracted_text = extract_text_from_file(file)
                     if extracted_text != None:
-                        files.append({"text": extracted_text})
                         doc_id += 1
                         st.markdown("---")
                     else:
@@ -132,9 +131,7 @@ def component_file_input(container):
         for idx, doc in enumerate(files):
             with st.expander(f"Preview File {idx}"):
-                st.write(doc)
-        corpus = [
-            {"text": doc["text"], "id": doc_id} for doc_id, doc in enumerate(files)
-        ]
-        return corpus

                 "index_pipeline": index_pipeline,
                 "doc": pipeline_funcs[index_pipe].__doc__,
             }
+            st.session_state["doc_id"] = 0
 def component_show_pipeline(pipeline, pipeline_name):
     """Draw the pipeline"""
+    expander_text = "Show pipeline"
+    if pipeline["doc"] is not None and "BUG" in pipeline["doc"]:
+        expander_text += "  ⚠️"
+    with st.expander(expander_text):
         if pipeline["doc"] is not None:
             st.markdown(pipeline["doc"])
         fig = get_pipeline_graph(pipeline[pipeline_name])
             st.markdown(f"### Match {idx+1}")
             st.markdown(f"**Text**: {document['text']}")
             st.markdown(f"**Document**: {document['id']}")
+            if "_split_id" in document["meta"]:
+                st.markdown(f"**Document Chunk**: {document['meta']['_split_id']}")
             if document["score"] is not None:
                 st.markdown(f"**Score**: {document['score']:.3f}")
             st.markdown("---")
+def component_text_input(container, doc_id):
     """Draw the Text Input widget"""
     with container:
         texts = []
         with st.expander("Enter documents"):
             while True:
                 text = st.text_input(f"Document {doc_id}", key=doc_id)
                 if text != "":
+                    texts.append({"text": text, "doc_id": doc_id})
                     doc_id += 1
                     st.markdown("---")
                 else:
                     break
+        corpus = [{"text": doc["text"], "id": doc["doc_id"]} for doc in texts]
+        return corpus, doc_id
+def component_article_url(container, doc_id):
     """Draw the Article URL widget"""
     with container:
         urls = []
         with st.expander("Enter URLs"):
             while True:
                 url = st.text_input(f"URL {doc_id}", key=doc_id)
                 if url != "":
+                    urls.append({"text": extract_text_from_url(url), "doc_id": doc_id})
                     doc_id += 1
                     st.markdown("---")
                 else:
         for idx, doc in enumerate(urls):
             with st.expander(f"Preview URL {idx}"):
+                st.write(doc["text"])
+        corpus = [{"text": doc["text"], "id": doc["doc_id"]} for doc in urls]
+        return corpus, doc_id
+def component_file_input(container, doc_id):
     """Draw the extract text from file widget"""
     with container:
         files = []
         with st.expander("Enter Files"):
             while True:
                 file = st.file_uploader(
                 if file != None:
                     extracted_text = extract_text_from_file(file)
                     if extracted_text != None:
+                        files.append({"text": extracted_text, "doc_id": doc_id})
                         doc_id += 1
                         st.markdown("---")
                     else:
         for idx, doc in enumerate(files):
             with st.expander(f"Preview File {idx}"):
+                st.write(doc["text"])
+        corpus = [{"text": doc["text"], "id": doc["doc_id"]} for doc in files]
+        return corpus, doc_id

interface/config.py CHANGED Viewed

@@ -1,7 +1,11 @@
 from interface.pages import page_landing_page, page_search, page_index
 # Define default Session Variables over the whole session.
-session_state_variables = {"pipeline": None, "pipeline_func_parameters": []}
 # Define Pages for the demo
 pages = {

 from interface.pages import page_landing_page, page_search, page_index
 # Define default Session Variables over the whole session.
+session_state_variables = {
+    "pipeline": None,
+    "pipeline_func_parameters": [],
+    "doc_id": 0,
+}
 # Define Pages for the demo
 pages = {

interface/pages.py CHANGED Viewed

@@ -79,14 +79,17 @@ def page_index(container):
             orientation="horizontal",
         )
-        corpus = input_funcs[selected_input][0](container)
         if len(corpus) > 0:
             index_results = None
             if st.button("Index"):
                 index_results = index(
-                    corpus,
-                    st.session_state["pipeline"]["index_pipeline"],
                 )
             if index_results:
                 st.write(index_results)

             orientation="horizontal",
         )
+        clear_index = st.sidebar.checkbox("Clear Index", True)
+        doc_id = st.session_state["doc_id"]
+        corpus, doc_id = input_funcs[selected_input][0](container, doc_id)
         if len(corpus) > 0:
             index_results = None
             if st.button("Index"):
                 index_results = index(
+                    corpus, st.session_state["pipeline"]["index_pipeline"], clear_index
                 )
+                st.session_state["doc_id"] = doc_id
             if index_results:
                 st.write(index_results)