Sean-Case committed on
Commit
ae4a7ec
1 Parent(s): 41ed1b7

Updated web ingest. Added some warnings to intro text

Browse files
app.py CHANGED
@@ -101,7 +101,7 @@ with block:
101
  #with gr.Row():
102
  gr.Markdown("<h1><center>Lightweight PDF / web page QA bot</center></h1>")
103
 
104
- gr.Markdown("Chat with a document (alpha). By default the Lambeth Borough Plan '[Lambeth 2030 : Our Future, Our Lambeth](https://www.lambeth.gov.uk/better-fairer-lambeth/projects/lambeth-2030-our-future-our-lambeth)' is loaded. If you want to talk about another document or web page (feature temporarily disabled), please select below. The chatbot will not answer questions where answered can't be found on the website.\n\nIf switching topic, please click the 'New topic' button as the bot will assume follow up questions are linked to the first. Sources are shown underneath the chat area.\n\nPlease note that LLM chatbots may give incomplete or incorrect information, so please use with care.")
105
 
106
  with gr.Tab("Chatbot"):
107
 
@@ -137,11 +137,11 @@ with block:
137
  in_pdf = gr.File(label="Upload pdf", file_count="multiple", file_types=['.pdf'])
138
  load_pdf = gr.Button(value="Load in file", variant="secondary", scale=0)
139
 
140
- with gr.Accordion("Web page - Temporarily disabled", open = False):
141
  with gr.Row():
142
  in_web = gr.Textbox(label="Enter webpage url")
143
  in_div = gr.Textbox(label="(Advanced) Webpage div for text extraction", value="p", placeholder="p")
144
- load_web = gr.Button(value="Load in webpage", variant="secondary", scale=0, visible=False)
145
 
146
  ingest_embed_out = gr.Textbox(label="File/webpage preparation progress")
147
 
 
101
  #with gr.Row():
102
  gr.Markdown("<h1><center>Lightweight PDF / web page QA bot</center></h1>")
103
 
104
+ gr.Markdown("Chat with a document (alpha). By default the Lambeth Borough Plan '[Lambeth 2030 : Our Future, Our Lambeth](https://www.lambeth.gov.uk/better-fairer-lambeth/projects/lambeth-2030-our-future-our-lambeth)' is loaded. If you want to talk about another document or web page (feature temporarily disabled), please select below. The chatbot will not answer questions where answers can't be found on the website. If switching topic, please click the 'New topic' button as the bot will assume follow up questions are linked to the first. Sources are shown underneath the chat area.\n\nWarnings: Please ensure that the document is not sensitive in any way as other users may see it!\n\nPlease note that LLM chatbots may give incomplete or incorrect information, so please use with care.")
105
 
106
  with gr.Tab("Chatbot"):
107
 
 
137
  in_pdf = gr.File(label="Upload pdf", file_count="multiple", file_types=['.pdf'])
138
  load_pdf = gr.Button(value="Load in file", variant="secondary", scale=0)
139
 
140
+ with gr.Accordion("Web page", open = False):
141
  with gr.Row():
142
  in_web = gr.Textbox(label="Enter webpage url")
143
  in_div = gr.Textbox(label="(Advanced) Webpage div for text extraction", value="p", placeholder="p")
144
+ load_web = gr.Button(value="Load in webpage", variant="secondary", scale=0)
145
 
146
  ingest_embed_out = gr.Textbox(label="File/webpage preparation progress")
147
 
chatfuncs/chatfuncs.py CHANGED
@@ -395,13 +395,13 @@ def hybrid_retrieval(new_question_kworded, k_val, out_passages,
395
 
396
  return docs_keep_as_doc, doc_df, docs_keep_out
397
 
398
- def get_expanded_passages(vectorstore, docs_keep_out, width):
399
  """
400
  Extracts expanded passages based on given documents and a width for context.
401
 
402
  Parameters:
403
  - vectorstore: The primary data source.
404
- - docs_keep_out: List of documents to be expanded.
405
  - width: Number of documents to expand around a given document for context.
406
 
407
  Returns:
@@ -436,8 +436,8 @@ def get_expanded_passages(vectorstore, docs_keep_out, width):
436
  for key in d1:
437
  if key != "source":
438
  merged[key] = str(d1[key]) + " to " + str(d2[key])
439
- else:
440
- merged[key] = d1[key] # or d2[key], based on preference
441
  return merged
442
 
443
  def merge_two_lists_of_dicts(list1, list2):
@@ -446,15 +446,22 @@ def get_expanded_passages(vectorstore, docs_keep_out, width):
446
  vstore_docs = get_docs_from_vstore(vectorstore)
447
  parent_vstore_meta_section = [doc.metadata['page_section'] for _, doc in vstore_docs]
448
 
 
 
449
  expanded_docs = []
450
- for doc, score in docs_keep_out:
451
  search_section = doc.metadata['page_section']
452
  search_index = parent_vstore_meta_section.index(search_section) if search_section in parent_vstore_meta_section else -1
453
 
454
  content_str, meta_first, meta_last = get_parent_content_and_meta(vstore_docs, width, search_index)
 
 
 
 
 
455
  meta_full = merge_two_lists_of_dicts(meta_first, meta_last)
456
 
457
- print(meta_full)
458
 
459
  expanded_doc = (Document(page_content=content_str[0], metadata=meta_full[0]), score)
460
  expanded_docs.append(expanded_doc)
@@ -679,7 +686,7 @@ def highlight_found_text(search_text: str, full_text: str, hlt_chunk_size:int=hl
679
  if sorted_starts:
680
  current_start, current_end = sorted_starts[0], found_positions[sorted_starts[0]]
681
  for start in sorted_starts[1:]:
682
- if start <= (current_end + 1):
683
  current_end = max(current_end, found_positions[start])
684
  else:
685
  combined_positions.append((current_start, current_end))
 
395
 
396
  return docs_keep_as_doc, doc_df, docs_keep_out
397
 
398
+ def get_expanded_passages(vectorstore, docs, width):
399
  """
400
  Extracts expanded passages based on given documents and a width for context.
401
 
402
  Parameters:
403
  - vectorstore: The primary data source.
404
+ - docs: List of documents to be expanded.
405
  - width: Number of documents to expand around a given document for context.
406
 
407
  Returns:
 
436
  for key in d1:
437
  if key != "source":
438
  merged[key] = str(d1[key]) + " to " + str(d2[key])
439
+ else:
440
+ merged[key] = d1[key] # or d2[key], based on preference
441
  return merged
442
 
443
  def merge_two_lists_of_dicts(list1, list2):
 
446
  vstore_docs = get_docs_from_vstore(vectorstore)
447
  parent_vstore_meta_section = [doc.metadata['page_section'] for _, doc in vstore_docs]
448
 
449
+ print(docs)
450
+
451
  expanded_docs = []
452
+ for doc, score in docs:
453
  search_section = doc.metadata['page_section']
454
  search_index = parent_vstore_meta_section.index(search_section) if search_section in parent_vstore_meta_section else -1
455
 
456
  content_str, meta_first, meta_last = get_parent_content_and_meta(vstore_docs, width, search_index)
457
+ print("Meta first:")
458
+ print(meta_first)
459
+ print("Meta last:")
460
+ print(meta_last)
461
+ print("Meta last end.")
462
  meta_full = merge_two_lists_of_dicts(meta_first, meta_last)
463
 
464
+ #print(meta_full)
465
 
466
  expanded_doc = (Document(page_content=content_str[0], metadata=meta_full[0]), score)
467
  expanded_docs.append(expanded_doc)
 
686
  if sorted_starts:
687
  current_start, current_end = sorted_starts[0], found_positions[sorted_starts[0]]
688
  for start in sorted_starts[1:]:
689
+ if start <= (current_end + 10):
690
  current_end = max(current_end, found_positions[start])
691
  else:
692
  combined_positions.append((current_start, current_end))
chatfuncs/ingest_borough_plan.py CHANGED
@@ -7,7 +7,7 @@ print("Borough plan text created")
7
 
8
  #print(borough_plan_text)
9
 
10
- borough_plan_docs, borough_plan_page_docs = ing.text_to_docs(borough_plan_text)
11
  print("Borough plan docs created")
12
 
13
  embedding_model = "thenlper/gte-base"
 
7
 
8
  #print(borough_plan_text)
9
 
10
+ borough_plan_docs = ing.text_to_docs(borough_plan_text)
11
  print("Borough plan docs created")
12
 
13
  embedding_model = "thenlper/gte-base"