Spaces:
Runtime error
Runtime error
Sean-Case
committed on
Commit
•
9118536
1
Parent(s):
ae4a7ec
Added reference to source on app. Minor text changes.
Browse files
- app.py +6 -4
- chatfuncs/chatfuncs.py +6 -6
- chatfuncs/ingest.py +20 -7
app.py
CHANGED
@@ -101,7 +101,9 @@ with block:
|
|
101 |
#with gr.Row():
|
102 |
gr.Markdown("<h1><center>Lightweight PDF / web page QA bot</center></h1>")
|
103 |
|
104 |
-
gr.Markdown("Chat with a document (alpha). By default the Lambeth Borough Plan '[Lambeth 2030 : Our Future, Our Lambeth](https://www.lambeth.gov.uk/better-fairer-lambeth/projects/lambeth-2030-our-future-our-lambeth)' is loaded. If you want to talk about another document or web page (feature temporarily disabled), please select below. The chatbot will not answer questions where answered can't be found on the website. If switching topic, please click the 'New topic' button as the bot will assume follow up questions are linked to the first. Sources are shown underneath the chat area.\n\nWarnings: Please ensure that the document is not sensitive is any way as other users may see it
|
|
|
|
|
105 |
|
106 |
with gr.Tab("Chatbot"):
|
107 |
|
@@ -128,7 +130,7 @@ with block:
|
|
128 |
)
|
129 |
|
130 |
with gr.Row():
|
131 |
-
current_topic = gr.Textbox(label="
|
132 |
clear = gr.Button(value="New topic", variant="secondary", scale=0)
|
133 |
|
134 |
|
@@ -163,13 +165,13 @@ with block:
|
|
163 |
# return gr.Examples.update(visible=False)
|
164 |
|
165 |
# Load in a pdf
|
166 |
-
load_pdf_click = load_pdf.click(ing.parse_file, inputs=[in_pdf], outputs=[ingest_text]).\
|
167 |
then(ing.text_to_docs, inputs=[ingest_text], outputs=[ingest_docs]).\
|
168 |
then(docs_to_faiss_save, inputs=[ingest_docs], outputs=ingest_embed_out) # #then(load_embeddings, outputs=[embeddings_state]).\
|
169 |
#then(hide_examples)
|
170 |
|
171 |
# Load in a webpage
|
172 |
-
load_web_click = load_web.click(ing.parse_html, inputs=[in_web, in_div], outputs=[ingest_text, ingest_metadata]).\
|
173 |
then(ing.html_text_to_docs, inputs=[ingest_text, ingest_metadata], outputs=[ingest_docs]).\
|
174 |
then(docs_to_faiss_save, inputs=[ingest_docs], outputs=ingest_embed_out)
|
175 |
#then(hide_examples)
|
|
|
101 |
#with gr.Row():
|
102 |
gr.Markdown("<h1><center>Lightweight PDF / web page QA bot</center></h1>")
|
103 |
|
104 |
+
gr.Markdown("Chat with a document (alpha). By default the Lambeth Borough Plan '[Lambeth 2030 : Our Future, Our Lambeth](https://www.lambeth.gov.uk/better-fairer-lambeth/projects/lambeth-2030-our-future-our-lambeth)' is loaded. If you want to talk about another document or web page (feature temporarily disabled), please select below. The chatbot will not answer questions where answered can't be found on the website. If switching topic, please click the 'New topic' button as the bot will assume follow up questions are linked to the first. Sources are shown underneath the chat area.\n\nWarnings: This is a public app. Please ensure that the document you upload is not sensitive is any way as other users may see it! Also, please note that LLM chatbots may give incomplete or incorrect information, so please use with care.")
|
105 |
+
|
106 |
+
current_source = gr.Textbox(label="Current data source that is loaded into the app", value="Lambeth_2030-Our_Future_Our_Lambeth.pdf")
|
107 |
|
108 |
with gr.Tab("Chatbot"):
|
109 |
|
|
|
130 |
)
|
131 |
|
132 |
with gr.Row():
|
133 |
+
current_topic = gr.Textbox(label="Keywords related to current conversation topic. If you want to talk about something else, press 'New topic'", placeholder="Keywords related to the conversation topic will appear here")
|
134 |
clear = gr.Button(value="New topic", variant="secondary", scale=0)
|
135 |
|
136 |
|
|
|
165 |
# return gr.Examples.update(visible=False)
|
166 |
|
167 |
# Load in a pdf
|
168 |
+
load_pdf_click = load_pdf.click(ing.parse_file, inputs=[in_pdf], outputs=[ingest_text, current_source]).\
|
169 |
then(ing.text_to_docs, inputs=[ingest_text], outputs=[ingest_docs]).\
|
170 |
then(docs_to_faiss_save, inputs=[ingest_docs], outputs=ingest_embed_out) # #then(load_embeddings, outputs=[embeddings_state]).\
|
171 |
#then(hide_examples)
|
172 |
|
173 |
# Load in a webpage
|
174 |
+
load_web_click = load_web.click(ing.parse_html, inputs=[in_web, in_div], outputs=[ingest_text, ingest_metadata, current_source]).\
|
175 |
then(ing.html_text_to_docs, inputs=[ingest_text, ingest_metadata], outputs=[ingest_docs]).\
|
176 |
then(docs_to_faiss_save, inputs=[ingest_docs], outputs=ingest_embed_out)
|
177 |
#then(hide_examples)
|
chatfuncs/chatfuncs.py
CHANGED
@@ -446,7 +446,7 @@ def get_expanded_passages(vectorstore, docs, width):
|
|
446 |
vstore_docs = get_docs_from_vstore(vectorstore)
|
447 |
parent_vstore_meta_section = [doc.metadata['page_section'] for _, doc in vstore_docs]
|
448 |
|
449 |
-
print(docs)
|
450 |
|
451 |
expanded_docs = []
|
452 |
for doc, score in docs:
|
@@ -454,11 +454,11 @@ def get_expanded_passages(vectorstore, docs, width):
|
|
454 |
search_index = parent_vstore_meta_section.index(search_section) if search_section in parent_vstore_meta_section else -1
|
455 |
|
456 |
content_str, meta_first, meta_last = get_parent_content_and_meta(vstore_docs, width, search_index)
|
457 |
-
print("Meta first:")
|
458 |
-
print(meta_first)
|
459 |
-
print("Meta last:")
|
460 |
-
print(meta_last)
|
461 |
-
print("Meta last end.")
|
462 |
meta_full = merge_two_lists_of_dicts(meta_first, meta_last)
|
463 |
|
464 |
#print(meta_full)
|
|
|
446 |
vstore_docs = get_docs_from_vstore(vectorstore)
|
447 |
parent_vstore_meta_section = [doc.metadata['page_section'] for _, doc in vstore_docs]
|
448 |
|
449 |
+
#print(docs)
|
450 |
|
451 |
expanded_docs = []
|
452 |
for doc, score in docs:
|
|
|
454 |
search_index = parent_vstore_meta_section.index(search_section) if search_section in parent_vstore_meta_section else -1
|
455 |
|
456 |
content_str, meta_first, meta_last = get_parent_content_and_meta(vstore_docs, width, search_index)
|
457 |
+
#print("Meta first:")
|
458 |
+
#print(meta_first)
|
459 |
+
#print("Meta last:")
|
460 |
+
#print(meta_last)
|
461 |
+
#print("Meta last end.")
|
462 |
meta_full = merge_two_lists_of_dicts(meta_first, meta_last)
|
463 |
|
464 |
#print(meta_full)
|
chatfuncs/ingest.py
CHANGED
@@ -45,7 +45,7 @@ start_index = True
|
|
45 |
|
46 |
## Parse files
|
47 |
|
48 |
-
def parse_file(file_paths
|
49 |
"""
|
50 |
Accepts a list of file paths, determines each file's type based on its extension,
|
51 |
and passes it to the relevant parsing function.
|
@@ -82,6 +82,7 @@ def parse_file(file_paths, div:str = "p"):
|
|
82 |
}
|
83 |
|
84 |
parsed_contents = {}
|
|
|
85 |
|
86 |
for file_path in file_paths:
|
87 |
print(file_path.name)
|
@@ -92,8 +93,12 @@ def parse_file(file_paths, div:str = "p"):
|
|
92 |
parsed_contents[file_path.name] = extension_to_parser[file_extension](file_path.name)
|
93 |
else:
|
94 |
parsed_contents[file_path.name] = f"Unsupported file type: {file_extension}"
|
|
|
|
|
|
|
|
|
95 |
|
96 |
-
return parsed_contents
|
97 |
|
98 |
def text_regex_clean(text):
|
99 |
# Merge hyphenated words
|
@@ -272,9 +277,16 @@ def parse_html(page_url, div_filter="p"):
|
|
272 |
texts.append(clean_text)
|
273 |
metadatas.append({"source": page_url, "date":str(date)})
|
274 |
|
275 |
-
print(metadatas)
|
276 |
|
277 |
-
return texts, metadatas
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
278 |
|
279 |
# +
|
280 |
# Convert parsed text to docs
|
@@ -302,10 +314,11 @@ def text_to_docs(text_dict: dict, chunk_size: int = chunk_size) -> List[Document
|
|
302 |
print(f"Unsupported file type {ext} for {file_path}. Skipping.")
|
303 |
continue
|
304 |
|
305 |
-
#match = re.search(r'.*[\/\\](.+)$', file_path)
|
306 |
-
match = re.search(r'(.*[\/\\])?(.+)$', file_path)
|
307 |
|
308 |
-
filename_end =
|
|
|
|
|
|
|
309 |
|
310 |
# Add filename as metadata
|
311 |
for doc in docs: doc.metadata["source"] = filename_end
|
|
|
45 |
|
46 |
## Parse files
|
47 |
|
48 |
+
def parse_file(file_paths):
|
49 |
"""
|
50 |
Accepts a list of file paths, determines each file's type based on its extension,
|
51 |
and passes it to the relevant parsing function.
|
|
|
82 |
}
|
83 |
|
84 |
parsed_contents = {}
|
85 |
+
file_names = []
|
86 |
|
87 |
for file_path in file_paths:
|
88 |
print(file_path.name)
|
|
|
93 |
parsed_contents[file_path.name] = extension_to_parser[file_extension](file_path.name)
|
94 |
else:
|
95 |
parsed_contents[file_path.name] = f"Unsupported file type: {file_extension}"
|
96 |
+
|
97 |
+
filename_end = get_file_path_end(file_path.name)
|
98 |
+
|
99 |
+
file_names.append(filename_end)
|
100 |
|
101 |
+
return parsed_contents, file_names
|
102 |
|
103 |
def text_regex_clean(text):
|
104 |
# Merge hyphenated words
|
|
|
277 |
texts.append(clean_text)
|
278 |
metadatas.append({"source": page_url, "date":str(date)})
|
279 |
|
280 |
+
#print(metadatas)
|
281 |
|
282 |
+
return texts, metadatas, page_url
|
283 |
+
|
284 |
+
def get_file_path_end(file_path):
|
285 |
+
match = re.search(r'(.*[\/\\])?(.+)$', file_path)
|
286 |
+
|
287 |
+
filename_end = match.group(2) if match else ''
|
288 |
+
|
289 |
+
return filename_end
|
290 |
|
291 |
# +
|
292 |
# Convert parsed text to docs
|
|
|
314 |
print(f"Unsupported file type {ext} for {file_path}. Skipping.")
|
315 |
continue
|
316 |
|
|
|
|
|
317 |
|
318 |
+
filename_end = get_file_path_end(file_path)
|
319 |
+
|
320 |
+
#match = re.search(r'(.*[\/\\])?(.+)$', file_path)
|
321 |
+
#filename_end = match.group(2) if match else ''
|
322 |
|
323 |
# Add filename as metadata
|
324 |
for doc in docs: doc.metadata["source"] = filename_end
|