Sean-Case
committed on
Commit
•
ae4a7ec
1
Parent(s):
41ed1b7
Updated web ingest. Added some warnings to intro text
Browse files
- app.py +3 -3
- chatfuncs/chatfuncs.py +14 -7
- chatfuncs/ingest_borough_plan.py +1 -1
app.py
CHANGED
@@ -101,7 +101,7 @@ with block:
|
|
101 |
#with gr.Row():
|
102 |
gr.Markdown("<h1><center>Lightweight PDF / web page QA bot</center></h1>")
|
103 |
|
104 |
-
gr.Markdown("Chat with a document (alpha). By default the Lambeth Borough Plan '[Lambeth 2030 : Our Future, Our Lambeth](https://www.lambeth.gov.uk/better-fairer-lambeth/projects/lambeth-2030-our-future-our-lambeth)' is loaded. If you want to talk about another document or web page (feature temporarily disabled), please select below. The chatbot will not answer questions where answered can't be found on the website
|
105 |
|
106 |
with gr.Tab("Chatbot"):
|
107 |
|
@@ -137,11 +137,11 @@ with block:
|
|
137 |
in_pdf = gr.File(label="Upload pdf", file_count="multiple", file_types=['.pdf'])
|
138 |
load_pdf = gr.Button(value="Load in file", variant="secondary", scale=0)
|
139 |
|
140 |
-
with gr.Accordion("Web page
|
141 |
with gr.Row():
|
142 |
in_web = gr.Textbox(label="Enter webpage url")
|
143 |
in_div = gr.Textbox(label="(Advanced) Webpage div for text extraction", value="p", placeholder="p")
|
144 |
-
load_web = gr.Button(value="Load in webpage", variant="secondary", scale=0
|
145 |
|
146 |
ingest_embed_out = gr.Textbox(label="File/webpage preparation progress")
|
147 |
|
|
|
101 |
#with gr.Row():
|
102 |
gr.Markdown("<h1><center>Lightweight PDF / web page QA bot</center></h1>")
|
103 |
|
104 |
+
gr.Markdown("Chat with a document (alpha). By default the Lambeth Borough Plan '[Lambeth 2030 : Our Future, Our Lambeth](https://www.lambeth.gov.uk/better-fairer-lambeth/projects/lambeth-2030-our-future-our-lambeth)' is loaded. If you want to talk about another document or web page (feature temporarily disabled), please select below. The chatbot will not answer questions where answered can't be found on the website. If switching topic, please click the 'New topic' button as the bot will assume follow up questions are linked to the first. Sources are shown underneath the chat area.\n\nWarnings: Please ensure that the document is not sensitive is any way as other users may see it!\n\nPlease note that LLM chatbots may give incomplete or incorrect information, so please use with care.")
|
105 |
|
106 |
with gr.Tab("Chatbot"):
|
107 |
|
|
|
137 |
in_pdf = gr.File(label="Upload pdf", file_count="multiple", file_types=['.pdf'])
|
138 |
load_pdf = gr.Button(value="Load in file", variant="secondary", scale=0)
|
139 |
|
140 |
+
with gr.Accordion("Web page", open = False):
|
141 |
with gr.Row():
|
142 |
in_web = gr.Textbox(label="Enter webpage url")
|
143 |
in_div = gr.Textbox(label="(Advanced) Webpage div for text extraction", value="p", placeholder="p")
|
144 |
+
load_web = gr.Button(value="Load in webpage", variant="secondary", scale=0)
|
145 |
|
146 |
ingest_embed_out = gr.Textbox(label="File/webpage preparation progress")
|
147 |
|
chatfuncs/chatfuncs.py
CHANGED
@@ -395,13 +395,13 @@ def hybrid_retrieval(new_question_kworded, k_val, out_passages,
|
|
395 |
|
396 |
return docs_keep_as_doc, doc_df, docs_keep_out
|
397 |
|
398 |
-
def get_expanded_passages(vectorstore,
|
399 |
"""
|
400 |
Extracts expanded passages based on given documents and a width for context.
|
401 |
|
402 |
Parameters:
|
403 |
- vectorstore: The primary data source.
|
404 |
-
-
|
405 |
- width: Number of documents to expand around a given document for context.
|
406 |
|
407 |
Returns:
|
@@ -436,8 +436,8 @@ def get_expanded_passages(vectorstore, docs_keep_out, width):
|
|
436 |
for key in d1:
|
437 |
if key != "source":
|
438 |
merged[key] = str(d1[key]) + " to " + str(d2[key])
|
439 |
-
|
440 |
-
|
441 |
return merged
|
442 |
|
443 |
def merge_two_lists_of_dicts(list1, list2):
|
@@ -446,15 +446,22 @@ def get_expanded_passages(vectorstore, docs_keep_out, width):
|
|
446 |
vstore_docs = get_docs_from_vstore(vectorstore)
|
447 |
parent_vstore_meta_section = [doc.metadata['page_section'] for _, doc in vstore_docs]
|
448 |
|
|
|
|
|
449 |
expanded_docs = []
|
450 |
-
for doc, score in
|
451 |
search_section = doc.metadata['page_section']
|
452 |
search_index = parent_vstore_meta_section.index(search_section) if search_section in parent_vstore_meta_section else -1
|
453 |
|
454 |
content_str, meta_first, meta_last = get_parent_content_and_meta(vstore_docs, width, search_index)
|
|
|
|
|
|
|
|
|
|
|
455 |
meta_full = merge_two_lists_of_dicts(meta_first, meta_last)
|
456 |
|
457 |
-
print(meta_full)
|
458 |
|
459 |
expanded_doc = (Document(page_content=content_str[0], metadata=meta_full[0]), score)
|
460 |
expanded_docs.append(expanded_doc)
|
@@ -679,7 +686,7 @@ def highlight_found_text(search_text: str, full_text: str, hlt_chunk_size:int=hl
|
|
679 |
if sorted_starts:
|
680 |
current_start, current_end = sorted_starts[0], found_positions[sorted_starts[0]]
|
681 |
for start in sorted_starts[1:]:
|
682 |
-
if start <= (current_end +
|
683 |
current_end = max(current_end, found_positions[start])
|
684 |
else:
|
685 |
combined_positions.append((current_start, current_end))
|
|
|
395 |
|
396 |
return docs_keep_as_doc, doc_df, docs_keep_out
|
397 |
|
398 |
+
def get_expanded_passages(vectorstore, docs, width):
|
399 |
"""
|
400 |
Extracts expanded passages based on given documents and a width for context.
|
401 |
|
402 |
Parameters:
|
403 |
- vectorstore: The primary data source.
|
404 |
+
- docs: List of documents to be expanded.
|
405 |
- width: Number of documents to expand around a given document for context.
|
406 |
|
407 |
Returns:
|
|
|
436 |
for key in d1:
|
437 |
if key != "source":
|
438 |
merged[key] = str(d1[key]) + " to " + str(d2[key])
|
439 |
+
else:
|
440 |
+
merged[key] = d1[key] # or d2[key], based on preference
|
441 |
return merged
|
442 |
|
443 |
def merge_two_lists_of_dicts(list1, list2):
|
|
|
446 |
vstore_docs = get_docs_from_vstore(vectorstore)
|
447 |
parent_vstore_meta_section = [doc.metadata['page_section'] for _, doc in vstore_docs]
|
448 |
|
449 |
+
print(docs)
|
450 |
+
|
451 |
expanded_docs = []
|
452 |
+
for doc, score in docs:
|
453 |
search_section = doc.metadata['page_section']
|
454 |
search_index = parent_vstore_meta_section.index(search_section) if search_section in parent_vstore_meta_section else -1
|
455 |
|
456 |
content_str, meta_first, meta_last = get_parent_content_and_meta(vstore_docs, width, search_index)
|
457 |
+
print("Meta first:")
|
458 |
+
print(meta_first)
|
459 |
+
print("Meta last:")
|
460 |
+
print(meta_last)
|
461 |
+
print("Meta last end.")
|
462 |
meta_full = merge_two_lists_of_dicts(meta_first, meta_last)
|
463 |
|
464 |
+
#print(meta_full)
|
465 |
|
466 |
expanded_doc = (Document(page_content=content_str[0], metadata=meta_full[0]), score)
|
467 |
expanded_docs.append(expanded_doc)
|
|
|
686 |
if sorted_starts:
|
687 |
current_start, current_end = sorted_starts[0], found_positions[sorted_starts[0]]
|
688 |
for start in sorted_starts[1:]:
|
689 |
+
if start <= (current_end + 10):
|
690 |
current_end = max(current_end, found_positions[start])
|
691 |
else:
|
692 |
combined_positions.append((current_start, current_end))
|
chatfuncs/ingest_borough_plan.py
CHANGED
@@ -7,7 +7,7 @@ print("Borough plan text created")
|
|
7 |
|
8 |
#print(borough_plan_text)
|
9 |
|
10 |
-
borough_plan_docs
|
11 |
print("Borough plan docs created")
|
12 |
|
13 |
embedding_model = "thenlper/gte-base"
|
|
|
7 |
|
8 |
#print(borough_plan_text)
|
9 |
|
10 |
+
borough_plan_docs = ing.text_to_docs(borough_plan_text)
|
11 |
print("Borough plan docs created")
|
12 |
|
13 |
embedding_model = "thenlper/gte-base"
|