Sean-Case committed on
Commit
9118536
1 Parent(s): ae4a7ec

Added reference to source on app. Minor text changes.

Browse files
Files changed (3) hide show
  1. app.py +6 -4
  2. chatfuncs/chatfuncs.py +6 -6
  3. chatfuncs/ingest.py +20 -7
app.py CHANGED
@@ -101,7 +101,9 @@ with block:
101
  #with gr.Row():
102
  gr.Markdown("<h1><center>Lightweight PDF / web page QA bot</center></h1>")
103
 
104
- gr.Markdown("Chat with a document (alpha). By default the Lambeth Borough Plan '[Lambeth 2030 : Our Future, Our Lambeth](https://www.lambeth.gov.uk/better-fairer-lambeth/projects/lambeth-2030-our-future-our-lambeth)' is loaded. If you want to talk about another document or web page (feature temporarily disabled), please select below. The chatbot will not answer questions where answered can't be found on the website. If switching topic, please click the 'New topic' button as the bot will assume follow up questions are linked to the first. Sources are shown underneath the chat area.\n\nWarnings: Please ensure that the document is not sensitive is any way as other users may see it!\n\nPlease note that LLM chatbots may give incomplete or incorrect information, so please use with care.")
 
 
105
 
106
  with gr.Tab("Chatbot"):
107
 
@@ -128,7 +130,7 @@ with block:
128
  )
129
 
130
  with gr.Row():
131
- current_topic = gr.Textbox(label="Current conversation topic. If you want to talk about something else, press 'New topic'", placeholder="Keywords related to the conversation topic will appear here")
132
  clear = gr.Button(value="New topic", variant="secondary", scale=0)
133
 
134
 
@@ -163,13 +165,13 @@ with block:
163
  # return gr.Examples.update(visible=False)
164
 
165
  # Load in a pdf
166
- load_pdf_click = load_pdf.click(ing.parse_file, inputs=[in_pdf], outputs=[ingest_text]).\
167
  then(ing.text_to_docs, inputs=[ingest_text], outputs=[ingest_docs]).\
168
  then(docs_to_faiss_save, inputs=[ingest_docs], outputs=ingest_embed_out) # #then(load_embeddings, outputs=[embeddings_state]).\
169
  #then(hide_examples)
170
 
171
  # Load in a webpage
172
- load_web_click = load_web.click(ing.parse_html, inputs=[in_web, in_div], outputs=[ingest_text, ingest_metadata]).\
173
  then(ing.html_text_to_docs, inputs=[ingest_text, ingest_metadata], outputs=[ingest_docs]).\
174
  then(docs_to_faiss_save, inputs=[ingest_docs], outputs=ingest_embed_out)
175
  #then(hide_examples)
 
101
  #with gr.Row():
102
  gr.Markdown("<h1><center>Lightweight PDF / web page QA bot</center></h1>")
103
 
104
+ gr.Markdown("Chat with a document (alpha). By default the Lambeth Borough Plan '[Lambeth 2030 : Our Future, Our Lambeth](https://www.lambeth.gov.uk/better-fairer-lambeth/projects/lambeth-2030-our-future-our-lambeth)' is loaded. If you want to talk about another document or web page (feature temporarily disabled), please select below. The chatbot will not answer questions where answers can't be found on the website. If switching topic, please click the 'New topic' button as the bot will assume follow up questions are linked to the first. Sources are shown underneath the chat area.\n\nWarnings: This is a public app. Please ensure that the document you upload is not sensitive in any way as other users may see it! Also, please note that LLM chatbots may give incomplete or incorrect information, so please use with care.")
105
+
106
+ current_source = gr.Textbox(label="Current data source that is loaded into the app", value="Lambeth_2030-Our_Future_Our_Lambeth.pdf")
107
 
108
  with gr.Tab("Chatbot"):
109
 
 
130
  )
131
 
132
  with gr.Row():
133
+ current_topic = gr.Textbox(label="Keywords related to current conversation topic. If you want to talk about something else, press 'New topic'", placeholder="Keywords related to the conversation topic will appear here")
134
  clear = gr.Button(value="New topic", variant="secondary", scale=0)
135
 
136
 
 
165
  # return gr.Examples.update(visible=False)
166
 
167
  # Load in a pdf
168
+ load_pdf_click = load_pdf.click(ing.parse_file, inputs=[in_pdf], outputs=[ingest_text, current_source]).\
169
  then(ing.text_to_docs, inputs=[ingest_text], outputs=[ingest_docs]).\
170
  then(docs_to_faiss_save, inputs=[ingest_docs], outputs=ingest_embed_out) # #then(load_embeddings, outputs=[embeddings_state]).\
171
  #then(hide_examples)
172
 
173
  # Load in a webpage
174
+ load_web_click = load_web.click(ing.parse_html, inputs=[in_web, in_div], outputs=[ingest_text, ingest_metadata, current_source]).\
175
  then(ing.html_text_to_docs, inputs=[ingest_text, ingest_metadata], outputs=[ingest_docs]).\
176
  then(docs_to_faiss_save, inputs=[ingest_docs], outputs=ingest_embed_out)
177
  #then(hide_examples)
chatfuncs/chatfuncs.py CHANGED
@@ -446,7 +446,7 @@ def get_expanded_passages(vectorstore, docs, width):
446
  vstore_docs = get_docs_from_vstore(vectorstore)
447
  parent_vstore_meta_section = [doc.metadata['page_section'] for _, doc in vstore_docs]
448
 
449
- print(docs)
450
 
451
  expanded_docs = []
452
  for doc, score in docs:
@@ -454,11 +454,11 @@ def get_expanded_passages(vectorstore, docs, width):
454
  search_index = parent_vstore_meta_section.index(search_section) if search_section in parent_vstore_meta_section else -1
455
 
456
  content_str, meta_first, meta_last = get_parent_content_and_meta(vstore_docs, width, search_index)
457
- print("Meta first:")
458
- print(meta_first)
459
- print("Meta last:")
460
- print(meta_last)
461
- print("Meta last end.")
462
  meta_full = merge_two_lists_of_dicts(meta_first, meta_last)
463
 
464
  #print(meta_full)
 
446
  vstore_docs = get_docs_from_vstore(vectorstore)
447
  parent_vstore_meta_section = [doc.metadata['page_section'] for _, doc in vstore_docs]
448
 
449
+ #print(docs)
450
 
451
  expanded_docs = []
452
  for doc, score in docs:
 
454
  search_index = parent_vstore_meta_section.index(search_section) if search_section in parent_vstore_meta_section else -1
455
 
456
  content_str, meta_first, meta_last = get_parent_content_and_meta(vstore_docs, width, search_index)
457
+ #print("Meta first:")
458
+ #print(meta_first)
459
+ #print("Meta last:")
460
+ #print(meta_last)
461
+ #print("Meta last end.")
462
  meta_full = merge_two_lists_of_dicts(meta_first, meta_last)
463
 
464
  #print(meta_full)
chatfuncs/ingest.py CHANGED
@@ -45,7 +45,7 @@ start_index = True
45
 
46
  ## Parse files
47
 
48
- def parse_file(file_paths, div:str = "p"):
49
  """
50
  Accepts a list of file paths, determines each file's type based on its extension,
51
  and passes it to the relevant parsing function.
@@ -82,6 +82,7 @@ def parse_file(file_paths, div:str = "p"):
82
  }
83
 
84
  parsed_contents = {}
 
85
 
86
  for file_path in file_paths:
87
  print(file_path.name)
@@ -92,8 +93,12 @@ def parse_file(file_paths, div:str = "p"):
92
  parsed_contents[file_path.name] = extension_to_parser[file_extension](file_path.name)
93
  else:
94
  parsed_contents[file_path.name] = f"Unsupported file type: {file_extension}"
 
 
 
 
95
 
96
- return parsed_contents
97
 
98
  def text_regex_clean(text):
99
  # Merge hyphenated words
@@ -272,9 +277,16 @@ def parse_html(page_url, div_filter="p"):
272
  texts.append(clean_text)
273
  metadatas.append({"source": page_url, "date":str(date)})
274
 
275
- print(metadatas)
276
 
277
- return texts, metadatas
 
 
 
 
 
 
 
278
 
279
  # +
280
  # Convert parsed text to docs
@@ -302,10 +314,11 @@ def text_to_docs(text_dict: dict, chunk_size: int = chunk_size) -> List[Document
302
  print(f"Unsupported file type {ext} for {file_path}. Skipping.")
303
  continue
304
 
305
- #match = re.search(r'.*[\/\\](.+)$', file_path)
306
- match = re.search(r'(.*[\/\\])?(.+)$', file_path)
307
 
308
- filename_end = match.group(2) if match else ''
 
 
 
309
 
310
  # Add filename as metadata
311
  for doc in docs: doc.metadata["source"] = filename_end
 
45
 
46
  ## Parse files
47
 
48
+ def parse_file(file_paths):
49
  """
50
  Accepts a list of file paths, determines each file's type based on its extension,
51
  and passes it to the relevant parsing function.
 
82
  }
83
 
84
  parsed_contents = {}
85
+ file_names = []
86
 
87
  for file_path in file_paths:
88
  print(file_path.name)
 
93
  parsed_contents[file_path.name] = extension_to_parser[file_extension](file_path.name)
94
  else:
95
  parsed_contents[file_path.name] = f"Unsupported file type: {file_extension}"
96
+
97
+ filename_end = get_file_path_end(file_path.name)
98
+
99
+ file_names.append(filename_end)
100
 
101
+ return parsed_contents, file_names
102
 
103
  def text_regex_clean(text):
104
  # Merge hyphenated words
 
277
  texts.append(clean_text)
278
  metadatas.append({"source": page_url, "date":str(date)})
279
 
280
+ #print(metadatas)
281
 
282
+ return texts, metadatas, page_url
283
+
284
+ def get_file_path_end(file_path):
285
+ match = re.search(r'(.*[\/\\])?(.+)$', file_path)
286
+
287
+ filename_end = match.group(2) if match else ''
288
+
289
+ return filename_end
290
 
291
  # +
292
  # Convert parsed text to docs
 
314
  print(f"Unsupported file type {ext} for {file_path}. Skipping.")
315
  continue
316
 
 
 
317
 
318
+ filename_end = get_file_path_end(file_path)
319
+
320
+ #match = re.search(r'(.*[\/\\])?(.+)$', file_path)
321
+ #filename_end = match.group(2) if match else ''
322
 
323
  # Add filename as metadata
324
  for doc in docs: doc.metadata["source"] = filename_end