Sean-Case committed on
Commit
41ed1b7
·
1 Parent(s): 1ae7b34

Added to intro, temporarily disabled web page load, addressed PDF load issue

Browse files
Files changed (4) hide show
  1. .gitignore +2 -1
  2. app.py +7 -5
  3. chatfuncs/ingest.py +11 -6
  4. chatfuncs/ingest_borough_plan.py +4 -4
.gitignore CHANGED
@@ -1,2 +1,3 @@
1
  *.pyc
2
- *.ipynb
 
 
1
  *.pyc
2
+ *.ipynb
3
+ *.pdf
app.py CHANGED
@@ -63,6 +63,8 @@ def docs_to_faiss_save(docs_out:PandasDataFrame, embeddings=embeddings):
63
 
64
  print(f"> Total split documents: {len(docs_out)}")
65
 
 
 
66
  vectorstore_func = FAISS.from_documents(documents=docs_out, embedding=embeddings)
67
 
68
  '''
@@ -99,7 +101,7 @@ with block:
99
  #with gr.Row():
100
  gr.Markdown("<h1><center>Lightweight PDF / web page QA bot</center></h1>")
101
 
102
- gr.Markdown("By default the Lambeth Borough Plan '[Lambeth 2030 : Our Future, Our Lambeth](https://www.lambeth.gov.uk/better-fairer-lambeth/projects/lambeth-2030-our-future-our-lambeth)' is loaded. If you want to talk about another document or web page, please select below. The chatbot will not answer questions where answered can't be found on the website.\n\nIf switching topic, please click the 'New topic' button as the bot will assume follow up questions are linked to the first. Sources are shown underneath the chat area.")
103
 
104
  with gr.Tab("Chatbot"):
105
 
@@ -135,11 +137,11 @@ with block:
135
  in_pdf = gr.File(label="Upload pdf", file_count="multiple", file_types=['.pdf'])
136
  load_pdf = gr.Button(value="Load in file", variant="secondary", scale=0)
137
 
138
- with gr.Accordion("Web page", open = False):
139
  with gr.Row():
140
  in_web = gr.Textbox(label="Enter webpage url")
141
  in_div = gr.Textbox(label="(Advanced) Webpage div for text extraction", value="p", placeholder="p")
142
- load_web = gr.Button(value="Load in webpage", variant="secondary", scale=0)
143
 
144
  ingest_embed_out = gr.Textbox(label="File/webpage preparation progress")
145
 
@@ -151,7 +153,7 @@ with block:
151
  ingest_metadata = gr.State()
152
  ingest_docs = gr.State()
153
 
154
- #embeddings_state = gr.State()
155
  vectorstore_state = gr.State()
156
 
157
  chat_history_state = gr.State()
@@ -163,7 +165,7 @@ with block:
163
  # Load in a pdf
164
  load_pdf_click = load_pdf.click(ing.parse_file, inputs=[in_pdf], outputs=[ingest_text]).\
165
  then(ing.text_to_docs, inputs=[ingest_text], outputs=[ingest_docs]).\
166
- then(docs_to_faiss_save, inputs=[ingest_docs], outputs=ingest_embed_out)
167
  #then(hide_examples)
168
 
169
  # Load in a webpage
 
63
 
64
  print(f"> Total split documents: {len(docs_out)}")
65
 
66
+ print(docs_out)
67
+
68
  vectorstore_func = FAISS.from_documents(documents=docs_out, embedding=embeddings)
69
 
70
  '''
 
101
  #with gr.Row():
102
  gr.Markdown("<h1><center>Lightweight PDF / web page QA bot</center></h1>")
103
 
104
+ gr.Markdown("Chat with a document (alpha). By default the Lambeth Borough Plan '[Lambeth 2030 : Our Future, Our Lambeth](https://www.lambeth.gov.uk/better-fairer-lambeth/projects/lambeth-2030-our-future-our-lambeth)' is loaded. If you want to talk about another document or web page (feature temporarily disabled), please select below. The chatbot will not answer questions where answered can't be found on the website.\n\nIf switching topic, please click the 'New topic' button as the bot will assume follow up questions are linked to the first. Sources are shown underneath the chat area.\n\nPlease note that LLM chatbots may give incomplete or incorrect information, so please use with care.")
105
 
106
  with gr.Tab("Chatbot"):
107
 
 
137
  in_pdf = gr.File(label="Upload pdf", file_count="multiple", file_types=['.pdf'])
138
  load_pdf = gr.Button(value="Load in file", variant="secondary", scale=0)
139
 
140
+ with gr.Accordion("Web page - Temporarily disabled", open = False):
141
  with gr.Row():
142
  in_web = gr.Textbox(label="Enter webpage url")
143
  in_div = gr.Textbox(label="(Advanced) Webpage div for text extraction", value="p", placeholder="p")
144
+ load_web = gr.Button(value="Load in webpage", variant="secondary", scale=0, visible=False)
145
 
146
  ingest_embed_out = gr.Textbox(label="File/webpage preparation progress")
147
 
 
153
  ingest_metadata = gr.State()
154
  ingest_docs = gr.State()
155
 
156
+ embeddings_state = gr.State()
157
  vectorstore_state = gr.State()
158
 
159
  chat_history_state = gr.State()
 
165
  # Load in a pdf
166
  load_pdf_click = load_pdf.click(ing.parse_file, inputs=[in_pdf], outputs=[ingest_text]).\
167
  then(ing.text_to_docs, inputs=[ingest_text], outputs=[ingest_docs]).\
168
+ then(docs_to_faiss_save, inputs=[ingest_docs], outputs=ingest_embed_out) # #then(load_embeddings, outputs=[embeddings_state]).\
169
  #then(hide_examples)
170
 
171
  # Load in a webpage
chatfuncs/ingest.py CHANGED
@@ -272,6 +272,8 @@ def parse_html(page_url, div_filter="p"):
272
  texts.append(clean_text)
273
  metadatas.append({"source": page_url, "date":str(date)})
274
 
 
 
275
  return texts, metadatas
276
 
277
  # +
@@ -300,9 +302,10 @@ def text_to_docs(text_dict: dict, chunk_size: int = chunk_size) -> List[Document
300
  print(f"Unsupported file type {ext} for {file_path}. Skipping.")
301
  continue
302
 
303
-
304
- match = re.search(r'.*[\/\\](.+)$', file_path)
305
- filename_end = match.group(1)
 
306
 
307
  # Add filename as metadata
308
  for doc in docs: doc.metadata["source"] = filename_end
@@ -311,7 +314,7 @@ def text_to_docs(text_dict: dict, chunk_size: int = chunk_size) -> List[Document
311
  doc_sections.extend(docs)
312
  #parent_doc_sections.extend(parent_docs)
313
 
314
- return doc_sections, page_docs
315
 
316
  def pdf_text_to_docs(text, chunk_size: int = chunk_size) -> List[Document]:
317
  """Converts a string or list of strings to a list of Documents
@@ -378,7 +381,9 @@ def html_text_to_docs(texts, metadatas, chunk_size:int = chunk_size):
378
  documents = text_splitter.create_documents(texts, metadatas=metadatas)
379
 
380
  for i, section in enumerate(documents):
381
- section.metadata["section"] = i + 1
 
 
382
 
383
  return documents
384
 
@@ -456,7 +461,7 @@ def load_embeddings(model_name = "thenlper/gte-base"):
456
 
457
  embeddings = embeddings_func
458
 
459
- #return embeddings_func
460
 
461
  def embed_faiss_save_to_zip(docs_out, save_to="faiss_lambeth_census_embedding", model_name = "thenlper/gte-base"):
462
 
 
272
  texts.append(clean_text)
273
  metadatas.append({"source": page_url, "date":str(date)})
274
 
275
+ print(metadatas)
276
+
277
  return texts, metadatas
278
 
279
  # +
 
302
  print(f"Unsupported file type {ext} for {file_path}. Skipping.")
303
  continue
304
 
305
+ #match = re.search(r'.*[\/\\](.+)$', file_path)
306
+ match = re.search(r'(.*[\/\\])?(.+)$', file_path)
307
+
308
+ filename_end = match.group(2) if match else ''
309
 
310
  # Add filename as metadata
311
  for doc in docs: doc.metadata["source"] = filename_end
 
314
  doc_sections.extend(docs)
315
  #parent_doc_sections.extend(parent_docs)
316
 
317
+ return doc_sections#, page_docs
318
 
319
  def pdf_text_to_docs(text, chunk_size: int = chunk_size) -> List[Document]:
320
  """Converts a string or list of strings to a list of Documents
 
381
  documents = text_splitter.create_documents(texts, metadatas=metadatas)
382
 
383
  for i, section in enumerate(documents):
384
+ section.metadata["page_section"] = i + 1
385
+
386
+
387
 
388
  return documents
389
 
 
461
 
462
  embeddings = embeddings_func
463
 
464
+ return embeddings_func
465
 
466
  def embed_faiss_save_to_zip(docs_out, save_to="faiss_lambeth_census_embedding", model_name = "thenlper/gte-base"):
467
 
chatfuncs/ingest_borough_plan.py CHANGED
@@ -1,7 +1,8 @@
1
  import ingest as ing
2
  import pandas as pd
3
 
4
- borough_plan_text = ing.parse_file([open("faiss_embedding/Lambeth_2030-Our_Future_Our_Lambeth.pdf")])
 
5
  print("Borough plan text created")
6
 
7
  #print(borough_plan_text)
@@ -11,6 +12,5 @@ print("Borough plan docs created")
11
 
12
  embedding_model = "thenlper/gte-base"
13
 
14
- ing.load_embeddings(model_name = embedding_model)
15
- ing.embed_faiss_save_to_zip(borough_plan_docs, save_to="faiss_embedding", model_name = embedding_model)
16
- #borough_plan_parent_docs.to_csv("borough_plan_parent_docs.csv", index=False)
 
1
  import ingest as ing
2
  import pandas as pd
3
 
4
+
5
+ borough_plan_text = ing.parse_file([open("Lambeth_2030-Our_Future_Our_Lambeth.pdf")])
6
  print("Borough plan text created")
7
 
8
  #print(borough_plan_text)
 
12
 
13
  embedding_model = "thenlper/gte-base"
14
 
15
+ embeddings = ing.load_embeddings(model_name = embedding_model)
16
+ ing.embed_faiss_save_to_zip(borough_plan_docs, save_to="faiss_embedding", model_name = embedding_model)