Sean-Case
committed on
Commit
•
41ed1b7
1
Parent(s):
1ae7b34
Added to intro, temp disabled web page load, address pdf load issue
Browse files
- .gitignore +2 -1
- app.py +7 -5
- chatfuncs/ingest.py +11 -6
- chatfuncs/ingest_borough_plan.py +4 -4
.gitignore
CHANGED
@@ -1,2 +1,3 @@
|
|
1 |
*.pyc
|
2 |
-
*.ipynb
|
|
|
|
1 |
*.pyc
|
2 |
+
*.ipynb
|
3 |
+
*.pdf
|
app.py
CHANGED
@@ -63,6 +63,8 @@ def docs_to_faiss_save(docs_out:PandasDataFrame, embeddings=embeddings):
|
|
63 |
|
64 |
print(f"> Total split documents: {len(docs_out)}")
|
65 |
|
|
|
|
|
66 |
vectorstore_func = FAISS.from_documents(documents=docs_out, embedding=embeddings)
|
67 |
|
68 |
'''
|
@@ -99,7 +101,7 @@ with block:
|
|
99 |
#with gr.Row():
|
100 |
gr.Markdown("<h1><center>Lightweight PDF / web page QA bot</center></h1>")
|
101 |
|
102 |
-
gr.Markdown("By default the Lambeth Borough Plan '[Lambeth 2030 : Our Future, Our Lambeth](https://www.lambeth.gov.uk/better-fairer-lambeth/projects/lambeth-2030-our-future-our-lambeth)' is loaded. If you want to talk about another document or web page, please select below. The chatbot will not answer questions where answered can't be found on the website.\n\nIf switching topic, please click the 'New topic' button as the bot will assume follow up questions are linked to the first. Sources are shown underneath the chat area.")
|
103 |
|
104 |
with gr.Tab("Chatbot"):
|
105 |
|
@@ -135,11 +137,11 @@ with block:
|
|
135 |
in_pdf = gr.File(label="Upload pdf", file_count="multiple", file_types=['.pdf'])
|
136 |
load_pdf = gr.Button(value="Load in file", variant="secondary", scale=0)
|
137 |
|
138 |
-
with gr.Accordion("Web page", open = False):
|
139 |
with gr.Row():
|
140 |
in_web = gr.Textbox(label="Enter webpage url")
|
141 |
in_div = gr.Textbox(label="(Advanced) Webpage div for text extraction", value="p", placeholder="p")
|
142 |
-
load_web = gr.Button(value="Load in webpage", variant="secondary", scale=0)
|
143 |
|
144 |
ingest_embed_out = gr.Textbox(label="File/webpage preparation progress")
|
145 |
|
@@ -151,7 +153,7 @@ with block:
|
|
151 |
ingest_metadata = gr.State()
|
152 |
ingest_docs = gr.State()
|
153 |
|
154 |
-
|
155 |
vectorstore_state = gr.State()
|
156 |
|
157 |
chat_history_state = gr.State()
|
@@ -163,7 +165,7 @@ with block:
|
|
163 |
# Load in a pdf
|
164 |
load_pdf_click = load_pdf.click(ing.parse_file, inputs=[in_pdf], outputs=[ingest_text]).\
|
165 |
then(ing.text_to_docs, inputs=[ingest_text], outputs=[ingest_docs]).\
|
166 |
-
then(docs_to_faiss_save, inputs=[ingest_docs], outputs=ingest_embed_out)
|
167 |
#then(hide_examples)
|
168 |
|
169 |
# Load in a webpage
|
|
|
63 |
|
64 |
print(f"> Total split documents: {len(docs_out)}")
|
65 |
|
66 |
+
print(docs_out)
|
67 |
+
|
68 |
vectorstore_func = FAISS.from_documents(documents=docs_out, embedding=embeddings)
|
69 |
|
70 |
'''
|
|
|
101 |
#with gr.Row():
|
102 |
gr.Markdown("<h1><center>Lightweight PDF / web page QA bot</center></h1>")
|
103 |
|
104 |
+
gr.Markdown("Chat with a document (alpha). By default the Lambeth Borough Plan '[Lambeth 2030 : Our Future, Our Lambeth](https://www.lambeth.gov.uk/better-fairer-lambeth/projects/lambeth-2030-our-future-our-lambeth)' is loaded. If you want to talk about another document or web page (feature temporarily disabled), please select below. The chatbot will not answer questions where answered can't be found on the website.\n\nIf switching topic, please click the 'New topic' button as the bot will assume follow up questions are linked to the first. Sources are shown underneath the chat area.\n\nPlease note that LLM chatbots may give incomplete or incorrect information, so please use with care.")
|
105 |
|
106 |
with gr.Tab("Chatbot"):
|
107 |
|
|
|
137 |
in_pdf = gr.File(label="Upload pdf", file_count="multiple", file_types=['.pdf'])
|
138 |
load_pdf = gr.Button(value="Load in file", variant="secondary", scale=0)
|
139 |
|
140 |
+
with gr.Accordion("Web page - Temporarily disabled", open = False):
|
141 |
with gr.Row():
|
142 |
in_web = gr.Textbox(label="Enter webpage url")
|
143 |
in_div = gr.Textbox(label="(Advanced) Webpage div for text extraction", value="p", placeholder="p")
|
144 |
+
load_web = gr.Button(value="Load in webpage", variant="secondary", scale=0, visible=False)
|
145 |
|
146 |
ingest_embed_out = gr.Textbox(label="File/webpage preparation progress")
|
147 |
|
|
|
153 |
ingest_metadata = gr.State()
|
154 |
ingest_docs = gr.State()
|
155 |
|
156 |
+
embeddings_state = gr.State()
|
157 |
vectorstore_state = gr.State()
|
158 |
|
159 |
chat_history_state = gr.State()
|
|
|
165 |
# Load in a pdf
|
166 |
load_pdf_click = load_pdf.click(ing.parse_file, inputs=[in_pdf], outputs=[ingest_text]).\
|
167 |
then(ing.text_to_docs, inputs=[ingest_text], outputs=[ingest_docs]).\
|
168 |
+
then(docs_to_faiss_save, inputs=[ingest_docs], outputs=ingest_embed_out) # #then(load_embeddings, outputs=[embeddings_state]).\
|
169 |
#then(hide_examples)
|
170 |
|
171 |
# Load in a webpage
|
chatfuncs/ingest.py
CHANGED
@@ -272,6 +272,8 @@ def parse_html(page_url, div_filter="p"):
|
|
272 |
texts.append(clean_text)
|
273 |
metadatas.append({"source": page_url, "date":str(date)})
|
274 |
|
|
|
|
|
275 |
return texts, metadatas
|
276 |
|
277 |
# +
|
@@ -300,9 +302,10 @@ def text_to_docs(text_dict: dict, chunk_size: int = chunk_size) -> List[Document
|
|
300 |
print(f"Unsupported file type {ext} for {file_path}. Skipping.")
|
301 |
continue
|
302 |
|
303 |
-
|
304 |
-
match = re.search(r'.*[\/\\](.+)$', file_path)
|
305 |
-
|
|
|
306 |
|
307 |
# Add filename as metadata
|
308 |
for doc in docs: doc.metadata["source"] = filename_end
|
@@ -311,7 +314,7 @@ def text_to_docs(text_dict: dict, chunk_size: int = chunk_size) -> List[Document
|
|
311 |
doc_sections.extend(docs)
|
312 |
#parent_doc_sections.extend(parent_docs)
|
313 |
|
314 |
-
return doc_sections
|
315 |
|
316 |
def pdf_text_to_docs(text, chunk_size: int = chunk_size) -> List[Document]:
|
317 |
"""Converts a string or list of strings to a list of Documents
|
@@ -378,7 +381,9 @@ def html_text_to_docs(texts, metadatas, chunk_size:int = chunk_size):
|
|
378 |
documents = text_splitter.create_documents(texts, metadatas=metadatas)
|
379 |
|
380 |
for i, section in enumerate(documents):
|
381 |
-
section.metadata["
|
|
|
|
|
382 |
|
383 |
return documents
|
384 |
|
@@ -456,7 +461,7 @@ def load_embeddings(model_name = "thenlper/gte-base"):
|
|
456 |
|
457 |
embeddings = embeddings_func
|
458 |
|
459 |
-
|
460 |
|
461 |
def embed_faiss_save_to_zip(docs_out, save_to="faiss_lambeth_census_embedding", model_name = "thenlper/gte-base"):
|
462 |
|
|
|
272 |
texts.append(clean_text)
|
273 |
metadatas.append({"source": page_url, "date":str(date)})
|
274 |
|
275 |
+
print(metadatas)
|
276 |
+
|
277 |
return texts, metadatas
|
278 |
|
279 |
# +
|
|
|
302 |
print(f"Unsupported file type {ext} for {file_path}. Skipping.")
|
303 |
continue
|
304 |
|
305 |
+
#match = re.search(r'.*[\/\\](.+)$', file_path)
|
306 |
+
match = re.search(r'(.*[\/\\])?(.+)$', file_path)
|
307 |
+
|
308 |
+
filename_end = match.group(2) if match else ''
|
309 |
|
310 |
# Add filename as metadata
|
311 |
for doc in docs: doc.metadata["source"] = filename_end
|
|
|
314 |
doc_sections.extend(docs)
|
315 |
#parent_doc_sections.extend(parent_docs)
|
316 |
|
317 |
+
return doc_sections#, page_docs
|
318 |
|
319 |
def pdf_text_to_docs(text, chunk_size: int = chunk_size) -> List[Document]:
|
320 |
"""Converts a string or list of strings to a list of Documents
|
|
|
381 |
documents = text_splitter.create_documents(texts, metadatas=metadatas)
|
382 |
|
383 |
for i, section in enumerate(documents):
|
384 |
+
section.metadata["page_section"] = i + 1
|
385 |
+
|
386 |
+
|
387 |
|
388 |
return documents
|
389 |
|
|
|
461 |
|
462 |
embeddings = embeddings_func
|
463 |
|
464 |
+
return embeddings_func
|
465 |
|
466 |
def embed_faiss_save_to_zip(docs_out, save_to="faiss_lambeth_census_embedding", model_name = "thenlper/gte-base"):
|
467 |
|
chatfuncs/ingest_borough_plan.py
CHANGED
@@ -1,7 +1,8 @@
|
|
1 |
import ingest as ing
|
2 |
import pandas as pd
|
3 |
|
4 |
-
|
|
|
5 |
print("Borough plan text created")
|
6 |
|
7 |
#print(borough_plan_text)
|
@@ -11,6 +12,5 @@ print("Borough plan docs created")
|
|
11 |
|
12 |
embedding_model = "thenlper/gte-base"
|
13 |
|
14 |
-
ing.load_embeddings(model_name = embedding_model)
|
15 |
-
ing.embed_faiss_save_to_zip(borough_plan_docs, save_to="faiss_embedding", model_name = embedding_model)
|
16 |
-
#borough_plan_parent_docs.to_csv("borough_plan_parent_docs.csv", index=False)
|
|
|
1 |
import ingest as ing
|
2 |
import pandas as pd
|
3 |
|
4 |
+
|
5 |
+
borough_plan_text = ing.parse_file([open("Lambeth_2030-Our_Future_Our_Lambeth.pdf")])
|
6 |
print("Borough plan text created")
|
7 |
|
8 |
#print(borough_plan_text)
|
|
|
12 |
|
13 |
embedding_model = "thenlper/gte-base"
|
14 |
|
15 |
+
embeddings = ing.load_embeddings(model_name = embedding_model)
|
16 |
+
ing.embed_faiss_save_to_zip(borough_plan_docs, save_to="faiss_embedding", model_name = embedding_model)
|
|