Sean-Case committed on
Commit
ae4a7ec
1 Parent(s): 41ed1b7

Updated web ingest. Added some warnings to intro text

Browse files
app.py CHANGED
@@ -101,7 +101,7 @@ with block:
101
  #with gr.Row():
102
  gr.Markdown("<h1><center>Lightweight PDF / web page QA bot</center></h1>")
103
 
104
- gr.Markdown("Chat with a document (alpha). By default the Lambeth Borough Plan '[Lambeth 2030 : Our Future, Our Lambeth](https://www.lambeth.gov.uk/better-fairer-lambeth/projects/lambeth-2030-our-future-our-lambeth)' is loaded. If you want to talk about another document or web page (feature temporarily disabled), please select below. The chatbot will not answer questions where answered can't be found on the website.\n\nIf switching topic, please click the 'New topic' button as the bot will assume follow up questions are linked to the first. Sources are shown underneath the chat area.\n\nPlease note that LLM chatbots may give incomplete or incorrect information, so please use with care.")
105
 
106
  with gr.Tab("Chatbot"):
107
 
@@ -137,11 +137,11 @@ with block:
137
  in_pdf = gr.File(label="Upload pdf", file_count="multiple", file_types=['.pdf'])
138
  load_pdf = gr.Button(value="Load in file", variant="secondary", scale=0)
139
 
140
- with gr.Accordion("Web page - Temporarily disabled", open = False):
141
  with gr.Row():
142
  in_web = gr.Textbox(label="Enter webpage url")
143
  in_div = gr.Textbox(label="(Advanced) Webpage div for text extraction", value="p", placeholder="p")
144
- load_web = gr.Button(value="Load in webpage", variant="secondary", scale=0, visible=False)
145
 
146
  ingest_embed_out = gr.Textbox(label="File/webpage preparation progress")
147
 
 
101
  #with gr.Row():
102
  gr.Markdown("<h1><center>Lightweight PDF / web page QA bot</center></h1>")
103
 
104
+ gr.Markdown("Chat with a document (alpha). By default the Lambeth Borough Plan '[Lambeth 2030 : Our Future, Our Lambeth](https://www.lambeth.gov.uk/better-fairer-lambeth/projects/lambeth-2030-our-future-our-lambeth)' is loaded. If you want to talk about another document or web page (feature temporarily disabled), please select below. The chatbot will not answer questions where answers can't be found on the website. If switching topic, please click the 'New topic' button as the bot will assume follow up questions are linked to the first. Sources are shown underneath the chat area.\n\nWarnings: Please ensure that the document is not sensitive in any way as other users may see it!\n\nPlease note that LLM chatbots may give incomplete or incorrect information, so please use with care.")
105
 
106
  with gr.Tab("Chatbot"):
107
 
 
137
  in_pdf = gr.File(label="Upload pdf", file_count="multiple", file_types=['.pdf'])
138
  load_pdf = gr.Button(value="Load in file", variant="secondary", scale=0)
139
 
140
+ with gr.Accordion("Web page", open = False):
141
  with gr.Row():
142
  in_web = gr.Textbox(label="Enter webpage url")
143
  in_div = gr.Textbox(label="(Advanced) Webpage div for text extraction", value="p", placeholder="p")
144
+ load_web = gr.Button(value="Load in webpage", variant="secondary", scale=0)
145
 
146
  ingest_embed_out = gr.Textbox(label="File/webpage preparation progress")
147
 
chatfuncs/chatfuncs.py CHANGED
@@ -395,13 +395,13 @@ def hybrid_retrieval(new_question_kworded, k_val, out_passages,
395
 
396
  return docs_keep_as_doc, doc_df, docs_keep_out
397
 
398
- def get_expanded_passages(vectorstore, docs_keep_out, width):
399
  """
400
  Extracts expanded passages based on given documents and a width for context.
401
 
402
  Parameters:
403
  - vectorstore: The primary data source.
404
- - docs_keep_out: List of documents to be expanded.
405
  - width: Number of documents to expand around a given document for context.
406
 
407
  Returns:
@@ -436,8 +436,8 @@ def get_expanded_passages(vectorstore, docs_keep_out, width):
436
  for key in d1:
437
  if key != "source":
438
  merged[key] = str(d1[key]) + " to " + str(d2[key])
439
- else:
440
- merged[key] = d1[key] # or d2[key], based on preference
441
  return merged
442
 
443
  def merge_two_lists_of_dicts(list1, list2):
@@ -446,15 +446,22 @@ def get_expanded_passages(vectorstore, docs_keep_out, width):
446
  vstore_docs = get_docs_from_vstore(vectorstore)
447
  parent_vstore_meta_section = [doc.metadata['page_section'] for _, doc in vstore_docs]
448
 
 
 
449
  expanded_docs = []
450
- for doc, score in docs_keep_out:
451
  search_section = doc.metadata['page_section']
452
  search_index = parent_vstore_meta_section.index(search_section) if search_section in parent_vstore_meta_section else -1
453
 
454
  content_str, meta_first, meta_last = get_parent_content_and_meta(vstore_docs, width, search_index)
 
 
 
 
 
455
  meta_full = merge_two_lists_of_dicts(meta_first, meta_last)
456
 
457
- print(meta_full)
458
 
459
  expanded_doc = (Document(page_content=content_str[0], metadata=meta_full[0]), score)
460
  expanded_docs.append(expanded_doc)
@@ -679,7 +686,7 @@ def highlight_found_text(search_text: str, full_text: str, hlt_chunk_size:int=hl
679
  if sorted_starts:
680
  current_start, current_end = sorted_starts[0], found_positions[sorted_starts[0]]
681
  for start in sorted_starts[1:]:
682
- if start <= (current_end + 1):
683
  current_end = max(current_end, found_positions[start])
684
  else:
685
  combined_positions.append((current_start, current_end))
 
395
 
396
  return docs_keep_as_doc, doc_df, docs_keep_out
397
 
398
+ def get_expanded_passages(vectorstore, docs, width):
399
  """
400
  Extracts expanded passages based on given documents and a width for context.
401
 
402
  Parameters:
403
  - vectorstore: The primary data source.
404
+ - docs: List of documents to be expanded.
405
  - width: Number of documents to expand around a given document for context.
406
 
407
  Returns:
 
436
  for key in d1:
437
  if key != "source":
438
  merged[key] = str(d1[key]) + " to " + str(d2[key])
439
+ else:
440
+ merged[key] = d1[key] # or d2[key], based on preference
441
  return merged
442
 
443
  def merge_two_lists_of_dicts(list1, list2):
 
446
  vstore_docs = get_docs_from_vstore(vectorstore)
447
  parent_vstore_meta_section = [doc.metadata['page_section'] for _, doc in vstore_docs]
448
 
449
+ print(docs)
450
+
451
  expanded_docs = []
452
+ for doc, score in docs:
453
  search_section = doc.metadata['page_section']
454
  search_index = parent_vstore_meta_section.index(search_section) if search_section in parent_vstore_meta_section else -1
455
 
456
  content_str, meta_first, meta_last = get_parent_content_and_meta(vstore_docs, width, search_index)
457
+ print("Meta first:")
458
+ print(meta_first)
459
+ print("Meta last:")
460
+ print(meta_last)
461
+ print("Meta last end.")
462
  meta_full = merge_two_lists_of_dicts(meta_first, meta_last)
463
 
464
+ #print(meta_full)
465
 
466
  expanded_doc = (Document(page_content=content_str[0], metadata=meta_full[0]), score)
467
  expanded_docs.append(expanded_doc)
 
686
  if sorted_starts:
687
  current_start, current_end = sorted_starts[0], found_positions[sorted_starts[0]]
688
  for start in sorted_starts[1:]:
689
+ if start <= (current_end + 10):
690
  current_end = max(current_end, found_positions[start])
691
  else:
692
  combined_positions.append((current_start, current_end))
chatfuncs/ingest_borough_plan.py CHANGED
@@ -7,7 +7,7 @@ print("Borough plan text created")
7
 
8
  #print(borough_plan_text)
9
 
10
- borough_plan_docs, borough_plan_page_docs = ing.text_to_docs(borough_plan_text)
11
  print("Borough plan docs created")
12
 
13
  embedding_model = "thenlper/gte-base"
 
7
 
8
  #print(borough_plan_text)
9
 
10
+ borough_plan_docs = ing.text_to_docs(borough_plan_text)
11
  print("Borough plan docs created")
12
 
13
  embedding_model = "thenlper/gte-base"