Sean-Case committed on
Commit
9118536
1 Parent(s): ae4a7ec

Added reference to source on app. Minor text changes.

Browse files
Files changed (3) hide show
  1. app.py +6 -4
  2. chatfuncs/chatfuncs.py +6 -6
  3. chatfuncs/ingest.py +20 -7
app.py CHANGED
@@ -101,7 +101,9 @@ with block:
101
  #with gr.Row():
102
  gr.Markdown("<h1><center>Lightweight PDF / web page QA bot</center></h1>")
103
 
104
- gr.Markdown("Chat with a document (alpha). By default the Lambeth Borough Plan '[Lambeth 2030 : Our Future, Our Lambeth](https://www.lambeth.gov.uk/better-fairer-lambeth/projects/lambeth-2030-our-future-our-lambeth)' is loaded. If you want to talk about another document or web page (feature temporarily disabled), please select below. The chatbot will not answer questions where answered can't be found on the website. If switching topic, please click the 'New topic' button as the bot will assume follow up questions are linked to the first. Sources are shown underneath the chat area.\n\nWarnings: Please ensure that the document is not sensitive is any way as other users may see it!\n\nPlease note that LLM chatbots may give incomplete or incorrect information, so please use with care.")
 
 
105
 
106
  with gr.Tab("Chatbot"):
107
 
@@ -128,7 +130,7 @@ with block:
128
  )
129
 
130
  with gr.Row():
131
- current_topic = gr.Textbox(label="Current conversation topic. If you want to talk about something else, press 'New topic'", placeholder="Keywords related to the conversation topic will appear here")
132
  clear = gr.Button(value="New topic", variant="secondary", scale=0)
133
 
134
 
@@ -163,13 +165,13 @@ with block:
163
  # return gr.Examples.update(visible=False)
164
 
165
  # Load in a pdf
166
- load_pdf_click = load_pdf.click(ing.parse_file, inputs=[in_pdf], outputs=[ingest_text]).\
167
  then(ing.text_to_docs, inputs=[ingest_text], outputs=[ingest_docs]).\
168
  then(docs_to_faiss_save, inputs=[ingest_docs], outputs=ingest_embed_out) # #then(load_embeddings, outputs=[embeddings_state]).\
169
  #then(hide_examples)
170
 
171
  # Load in a webpage
172
- load_web_click = load_web.click(ing.parse_html, inputs=[in_web, in_div], outputs=[ingest_text, ingest_metadata]).\
173
  then(ing.html_text_to_docs, inputs=[ingest_text, ingest_metadata], outputs=[ingest_docs]).\
174
  then(docs_to_faiss_save, inputs=[ingest_docs], outputs=ingest_embed_out)
175
  #then(hide_examples)
 
101
  #with gr.Row():
102
  gr.Markdown("<h1><center>Lightweight PDF / web page QA bot</center></h1>")
103
 
104
+ gr.Markdown("Chat with a document (alpha). By default the Lambeth Borough Plan '[Lambeth 2030 : Our Future, Our Lambeth](https://www.lambeth.gov.uk/better-fairer-lambeth/projects/lambeth-2030-our-future-our-lambeth)' is loaded. If you want to talk about another document or web page (feature temporarily disabled), please select below. The chatbot will not answer questions where answers can't be found on the website. If switching topic, please click the 'New topic' button as the bot will assume follow up questions are linked to the first. Sources are shown underneath the chat area.\n\nWarnings: This is a public app. Please ensure that the document you upload is not sensitive in any way as other users may see it! Also, please note that LLM chatbots may give incomplete or incorrect information, so please use with care.")
105
+
106
+ current_source = gr.Textbox(label="Current data source that is loaded into the app", value="Lambeth_2030-Our_Future_Our_Lambeth.pdf")
107
 
108
  with gr.Tab("Chatbot"):
109
 
 
130
  )
131
 
132
  with gr.Row():
133
+ current_topic = gr.Textbox(label="Keywords related to current conversation topic. If you want to talk about something else, press 'New topic'", placeholder="Keywords related to the conversation topic will appear here")
134
  clear = gr.Button(value="New topic", variant="secondary", scale=0)
135
 
136
 
 
165
  # return gr.Examples.update(visible=False)
166
 
167
  # Load in a pdf
168
+ load_pdf_click = load_pdf.click(ing.parse_file, inputs=[in_pdf], outputs=[ingest_text, current_source]).\
169
  then(ing.text_to_docs, inputs=[ingest_text], outputs=[ingest_docs]).\
170
  then(docs_to_faiss_save, inputs=[ingest_docs], outputs=ingest_embed_out) # #then(load_embeddings, outputs=[embeddings_state]).\
171
  #then(hide_examples)
172
 
173
  # Load in a webpage
174
+ load_web_click = load_web.click(ing.parse_html, inputs=[in_web, in_div], outputs=[ingest_text, ingest_metadata, current_source]).\
175
  then(ing.html_text_to_docs, inputs=[ingest_text, ingest_metadata], outputs=[ingest_docs]).\
176
  then(docs_to_faiss_save, inputs=[ingest_docs], outputs=ingest_embed_out)
177
  #then(hide_examples)
chatfuncs/chatfuncs.py CHANGED
@@ -446,7 +446,7 @@ def get_expanded_passages(vectorstore, docs, width):
446
  vstore_docs = get_docs_from_vstore(vectorstore)
447
  parent_vstore_meta_section = [doc.metadata['page_section'] for _, doc in vstore_docs]
448
 
449
- print(docs)
450
 
451
  expanded_docs = []
452
  for doc, score in docs:
@@ -454,11 +454,11 @@ def get_expanded_passages(vectorstore, docs, width):
454
  search_index = parent_vstore_meta_section.index(search_section) if search_section in parent_vstore_meta_section else -1
455
 
456
  content_str, meta_first, meta_last = get_parent_content_and_meta(vstore_docs, width, search_index)
457
- print("Meta first:")
458
- print(meta_first)
459
- print("Meta last:")
460
- print(meta_last)
461
- print("Meta last end.")
462
  meta_full = merge_two_lists_of_dicts(meta_first, meta_last)
463
 
464
  #print(meta_full)
 
446
  vstore_docs = get_docs_from_vstore(vectorstore)
447
  parent_vstore_meta_section = [doc.metadata['page_section'] for _, doc in vstore_docs]
448
 
449
+ #print(docs)
450
 
451
  expanded_docs = []
452
  for doc, score in docs:
 
454
  search_index = parent_vstore_meta_section.index(search_section) if search_section in parent_vstore_meta_section else -1
455
 
456
  content_str, meta_first, meta_last = get_parent_content_and_meta(vstore_docs, width, search_index)
457
+ #print("Meta first:")
458
+ #print(meta_first)
459
+ #print("Meta last:")
460
+ #print(meta_last)
461
+ #print("Meta last end.")
462
  meta_full = merge_two_lists_of_dicts(meta_first, meta_last)
463
 
464
  #print(meta_full)
chatfuncs/ingest.py CHANGED
@@ -45,7 +45,7 @@ start_index = True
45
 
46
  ## Parse files
47
 
48
- def parse_file(file_paths, div:str = "p"):
49
  """
50
  Accepts a list of file paths, determines each file's type based on its extension,
51
  and passes it to the relevant parsing function.
@@ -82,6 +82,7 @@ def parse_file(file_paths, div:str = "p"):
82
  }
83
 
84
  parsed_contents = {}
 
85
 
86
  for file_path in file_paths:
87
  print(file_path.name)
@@ -92,8 +93,12 @@ def parse_file(file_paths, div:str = "p"):
92
  parsed_contents[file_path.name] = extension_to_parser[file_extension](file_path.name)
93
  else:
94
  parsed_contents[file_path.name] = f"Unsupported file type: {file_extension}"
 
 
 
 
95
 
96
- return parsed_contents
97
 
98
  def text_regex_clean(text):
99
  # Merge hyphenated words
@@ -272,9 +277,16 @@ def parse_html(page_url, div_filter="p"):
272
  texts.append(clean_text)
273
  metadatas.append({"source": page_url, "date":str(date)})
274
 
275
- print(metadatas)
276
 
277
- return texts, metadatas
 
 
 
 
 
 
 
278
 
279
  # +
280
  # Convert parsed text to docs
@@ -302,10 +314,11 @@ def text_to_docs(text_dict: dict, chunk_size: int = chunk_size) -> List[Document
302
  print(f"Unsupported file type {ext} for {file_path}. Skipping.")
303
  continue
304
 
305
- #match = re.search(r'.*[\/\\](.+)$', file_path)
306
- match = re.search(r'(.*[\/\\])?(.+)$', file_path)
307
 
308
- filename_end = match.group(2) if match else ''
 
 
 
309
 
310
  # Add filename as metadata
311
  for doc in docs: doc.metadata["source"] = filename_end
 
45
 
46
  ## Parse files
47
 
48
+ def parse_file(file_paths):
49
  """
50
  Accepts a list of file paths, determines each file's type based on its extension,
51
  and passes it to the relevant parsing function.
 
82
  }
83
 
84
  parsed_contents = {}
85
+ file_names = []
86
 
87
  for file_path in file_paths:
88
  print(file_path.name)
 
93
  parsed_contents[file_path.name] = extension_to_parser[file_extension](file_path.name)
94
  else:
95
  parsed_contents[file_path.name] = f"Unsupported file type: {file_extension}"
96
+
97
+ filename_end = get_file_path_end(file_path.name)
98
+
99
+ file_names.append(filename_end)
100
 
101
+ return parsed_contents, file_names
102
 
103
  def text_regex_clean(text):
104
  # Merge hyphenated words
 
277
  texts.append(clean_text)
278
  metadatas.append({"source": page_url, "date":str(date)})
279
 
280
+ #print(metadatas)
281
 
282
+ return texts, metadatas, page_url
283
+
284
+ def get_file_path_end(file_path):
285
+ match = re.search(r'(.*[\/\\])?(.+)$', file_path)
286
+
287
+ filename_end = match.group(2) if match else ''
288
+
289
+ return filename_end
290
 
291
  # +
292
  # Convert parsed text to docs
 
314
  print(f"Unsupported file type {ext} for {file_path}. Skipping.")
315
  continue
316
 
 
 
317
 
318
+ filename_end = get_file_path_end(file_path)
319
+
320
+ #match = re.search(r'(.*[\/\\])?(.+)$', file_path)
321
+ #filename_end = match.group(2) if match else ''
322
 
323
  # Add filename as metadata
324
  for doc in docs: doc.metadata["source"] = filename_end