Sean-Case committed on
Commit
49e32ea
1 Parent(s): db3d7b6
.github/workflows/check_file_size.yml ADDED
@@ -0,0 +1,16 @@
+ name: Check file size
+ on: # or directly `on: [push]` to run the action on every push on any branch
+   pull_request:
+     branches: [main]
+
+   # to run this workflow manually from the Actions tab
+   workflow_dispatch:
+
+ jobs:
+   sync-to-hub:
+     runs-on: ubuntu-latest
+     steps:
+       - name: Check large files
+         uses: ActionsDesk/lfs-warning@v2.0
+         with:
+           filesizelimit: 10485760 # this is 10MB so we can sync to HF Spaces
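The filesizelimit above mirrors the 10 MB per-file ceiling for pushing to Hugging Face Spaces without Git LFS. As a rough local pre-check before committing, a minimal sketch along these lines could flag oversized files (this script is illustrative only and not part of the commit; the threshold simply copies the workflow's 10485760 bytes):

import os

LIMIT_BYTES = 10 * 1024 * 1024  # 10485760, matching the workflow's filesizelimit

# Walk the repository and report any file that would trip the lfs-warning action
for root, _, files in os.walk("."):
    if ".git" in root.split(os.sep):
        continue  # skip git internals
    for name in files:
        path = os.path.join(root, name)
        size = os.path.getsize(path)
        if size > LIMIT_BYTES:
            print(f"{path}: {size / (1024 * 1024):.1f} MB exceeds the 10 MB Spaces limit")

Running it from the repository root before pushing gives the same warning the action would raise in CI.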
.github/workflows/sync_hf.yml ADDED
@@ -0,0 +1,20 @@
+ name: Sync to Hugging Face hub
+ on:
+   push:
+     branches: [main]
+
+   # to run this workflow manually from the Actions tab
+   workflow_dispatch:
+
+ jobs:
+   sync-to-hub:
+     runs-on: ubuntu-latest
+     steps:
+       - uses: actions/checkout@v3
+         with:
+           fetch-depth: 0
+           lfs: true
+       - name: Push to hub
+         env:
+           HF_TOKEN: ${{ secrets.HF_TOKEN }}
+         run: git push https://seanpedrickcase:$HF_TOKEN@huggingface.co/spaces/seanpedrickcase/Light-PDF-Web-QA-Chatbot main
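The push above relies on an HF_TOKEN repository secret with write access to the Space. If a one-off manual sync is ever needed outside the workflow, a hedged alternative is the huggingface_hub client, assuming a recent version of the library is installed and HF_TOKEN is exported locally (this snippet is illustrative and not part of the commit):

import os
from huggingface_hub import HfApi

# Mirror the workflow's git push by uploading the working tree to the Space
api = HfApi(token=os.environ["HF_TOKEN"])
api.upload_folder(
    folder_path=".",
    repo_id="seanpedrickcase/Light-PDF-Web-QA-Chatbot",
    repo_type="space",
    ignore_patterns=[".git*", "*.pyc", "*.ipynb"],  # skip git internals and the .gitignore'd files
)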
.gitignore ADDED
@@ -0,0 +1,2 @@
+ *.pyc
+ *.ipynb
Dockerfile ADDED
@@ -0,0 +1,30 @@
+ FROM python:3.10
+
+ WORKDIR /src
+
+ COPY requirements.txt .
+
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ # Set up a new user named "user" with user ID 1000
+ RUN useradd -m -u 1000 user
+ # Switch to the "user" user
+ USER user
+ # Set home to the user's home directory
+ ENV HOME=/home/user \
+     PATH=/home/user/.local/bin:$PATH \
+     PYTHONPATH=$HOME/app \
+     PYTHONUNBUFFERED=1 \
+     GRADIO_ALLOW_FLAGGING=never \
+     GRADIO_NUM_PORTS=1 \
+     GRADIO_SERVER_NAME=0.0.0.0 \
+     GRADIO_THEME=huggingface \
+     SYSTEM=spaces
+
+ # Set the working directory to the user's home directory
+ WORKDIR $HOME/app
+
+ # Copy the current directory contents into the container at $HOME/app setting the owner to the user
+ COPY --chown=user . $HOME/app
+
+ CMD ["python", "app.py"]
README.md ADDED
@@ -0,0 +1,13 @@
+ ---
+ title: Light PDF web QA chatbot
+ emoji: 📈
+ colorFrom: yellow
+ colorTo: yellow
+ sdk: gradio
+ sdk_version: 3.35.2
+ app_file: app.py
+ pinned: false
+ license: apache-2.0
+ ---
+
+ Chat with a PDF file or web page using a lightweight language model through a Gradio interface. Quick responses even when running on CPU only.
app.py ADDED
@@ -0,0 +1,198 @@
1
+ # # Load in packages
2
+
3
+ # +
4
+ import os
5
+ from typing import TypeVar
6
+ from langchain.embeddings import HuggingFaceEmbeddings, HuggingFaceInstructEmbeddings
7
+ from langchain.vectorstores import FAISS
8
+
9
+
10
+ #PandasDataFrame: type[pd.core.frame.DataFrame]
11
+ PandasDataFrame = TypeVar('pd.core.frame.DataFrame')
12
+
13
+ # Disable cuda devices if necessary
14
+
15
+ #os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
16
+
17
+ #from chatfuncs.chatfuncs import *
18
+ import chatfuncs.ingest as ing
19
+
20
+ ## Load preset embeddings and vectorstore
21
+
22
+ embeddings_name = "thenlper/gte-base"
23
+
24
+ def load_embeddings(embeddings_name = "thenlper/gte-base"):
25
+
26
+
27
+ if embeddings_name == "hkunlp/instructor-large":
28
+ embeddings_func = HuggingFaceInstructEmbeddings(model_name=embeddings_name,
29
+ embed_instruction="Represent the paragraph for retrieval: ",
30
+ query_instruction="Represent the question for retrieving supporting documents: "
31
+ )
32
+
33
+ else:
34
+ embeddings_func = HuggingFaceEmbeddings(model_name=embeddings_name)
35
+
36
+ global embeddings
37
+
38
+ embeddings = embeddings_func
39
+
40
+ return embeddings
41
+
42
+ def get_faiss_store(faiss_vstore_folder,embeddings):
43
+ import zipfile
44
+ with zipfile.ZipFile(faiss_vstore_folder + '/' + faiss_vstore_folder + '.zip', 'r') as zip_ref:
45
+ zip_ref.extractall(faiss_vstore_folder)
46
+
47
+ faiss_vstore = FAISS.load_local(folder_path=faiss_vstore_folder, embeddings=embeddings)
48
+ os.remove(faiss_vstore_folder + "/index.faiss")
49
+ os.remove(faiss_vstore_folder + "/index.pkl")
50
+
51
+ global vectorstore
52
+
53
+ vectorstore = faiss_vstore
54
+
55
+ return vectorstore
56
+
57
+ import chatfuncs.chatfuncs as chatf
58
+
59
+ chatf.embeddings = load_embeddings(embeddings_name)
60
+ chatf.vectorstore = get_faiss_store(faiss_vstore_folder="faiss_embedding",embeddings=globals()["embeddings"])
61
+
62
+ def docs_to_faiss_save(docs_out:PandasDataFrame, embeddings=embeddings):
63
+
64
+ print(f"> Total split documents: {len(docs_out)}")
65
+
66
+ vectorstore_func = FAISS.from_documents(documents=docs_out, embedding=embeddings)
67
+
68
+ '''
69
+ #with open("vectorstore.pkl", "wb") as f:
70
+ #pickle.dump(vectorstore, f)
71
+ '''
72
+
73
+ #if Path(save_to).exists():
74
+ # vectorstore_func.save_local(folder_path=save_to)
75
+ #else:
76
+ # os.mkdir(save_to)
77
+ # vectorstore_func.save_local(folder_path=save_to)
78
+
79
+ global vectorstore
80
+
81
+ vectorstore = vectorstore_func
82
+
83
+ chatf.vectorstore = vectorstore
84
+
85
+ out_message = "Document processing complete"
86
+
87
+ #print(out_message)
88
+ #print(f"> Saved to: {save_to}")
89
+
90
+ return out_message
91
+
92
+ # Gradio chat
93
+
94
+ import gradio as gr
95
+
96
+ block = gr.Blocks(css=".gradio-container {background-color: black}")
97
+
98
+ with block:
99
+ #with gr.Row():
100
+ gr.Markdown("<h1><center>Lightweight PDF / web page QA bot</center></h1>")
101
+
102
+ gr.Markdown("By default the Lambeth Borough Plan '[Lambeth 2030 : Our Future, Our Lambeth](https://www.lambeth.gov.uk/better-fairer-lambeth/projects/lambeth-2030-our-future-our-lambeth)' is loaded. If you want to talk about another document or web page, please load it below. The chatbot will not answer questions whose answers can't be found in the loaded document or web page.\n\nIf switching topic, please click the 'New topic' button, as the bot will assume follow-up questions are linked to the first. Sources are shown underneath the chat area.")
103
+
104
+ with gr.Tab("Chatbot"):
105
+
106
+ with gr.Row():
107
+ chatbot = gr.Chatbot(height=300)
108
+ sources = gr.HTML(value = "Source paragraphs where I looked for answers will appear here", height=300)
109
+
110
+ with gr.Row():
111
+ message = gr.Textbox(
112
+ label="What's your question?",
113
+ lines=1,
114
+ )
115
+
116
+
117
+ submit = gr.Button(value="Send message", variant="secondary", scale = 1)
118
+
119
+ examples_set = gr.Examples(label="Examples for the Lambeth Borough Plan",
120
+ examples=[
121
+ "What were the five pillars of the previous borough plan?",
122
+ "What is the vision statement for Lambeth?",
123
+ "What are the commitments for Lambeth?",
124
+ "What are the 2030 outcomes for Lambeth?"],
125
+ inputs=message,
126
+ )
127
+
128
+ with gr.Row():
129
+ current_topic = gr.Textbox(label="Current conversation topic. If you want to talk about something else, press 'New topic'", placeholder="Keywords related to the conversation topic will appear here")
130
+ clear = gr.Button(value="New topic", variant="secondary", scale=0)
131
+
132
+
133
+ with gr.Tab("Load in a different PDF file or web page to chat"):
134
+ with gr.Accordion("PDF file", open = False):
135
+ in_pdf = gr.File(label="Upload pdf", file_count="multiple", file_types=['.pdf'])
136
+ load_pdf = gr.Button(value="Load in file", variant="secondary", scale=0)
137
+
138
+ with gr.Accordion("Web page", open = False):
139
+ with gr.Row():
140
+ in_web = gr.Textbox(label="Enter webpage url")
141
+ in_div = gr.Textbox(label="(Advanced) Webpage div for text extraction", value="p", placeholder="p")
142
+ load_web = gr.Button(value="Load in webpage", variant="secondary", scale=0)
143
+
144
+ ingest_embed_out = gr.Textbox(label="File/webpage preparation progress")
145
+
146
+ gr.HTML(
+ "<center>Powered by Flan Alpaca and Langchain</center>"
+ )
149
+
150
+ ingest_text = gr.State()
151
+ ingest_metadata = gr.State()
152
+ ingest_docs = gr.State()
153
+
154
+ #embeddings_state = gr.State()
155
+ vectorstore_state = gr.State()
156
+
157
+ chat_history_state = gr.State()
158
+ instruction_prompt_out = gr.State()
159
+
160
+ #def hide_examples():
161
+ # return gr.Examples.update(visible=False)
162
+
163
+ # Load in a pdf
164
+ load_pdf_click = load_pdf.click(ing.parse_file, inputs=[in_pdf], outputs=[ingest_text]).\
165
+ then(ing.text_to_docs, inputs=[ingest_text], outputs=[ingest_docs]).\
166
+ then(docs_to_faiss_save, inputs=[ingest_docs], outputs=ingest_embed_out)
167
+ #then(hide_examples)
168
+
169
+ # Load in a webpage
170
+ load_web_click = load_web.click(ing.parse_html, inputs=[in_web, in_div], outputs=[ingest_text, ingest_metadata]).\
171
+ then(ing.html_text_to_docs, inputs=[ingest_text, ingest_metadata], outputs=[ingest_docs]).\
172
+ then(docs_to_faiss_save, inputs=[ingest_docs], outputs=ingest_embed_out)
173
+ #then(hide_examples)
174
+
175
+ # Load in a webpage
176
+
177
+ # Click/enter to send message action
178
+ response_click = submit.click(chatf.get_history_sources_final_input_prompt, inputs=[message, chat_history_state, current_topic], outputs=[chat_history_state, sources, instruction_prompt_out], queue=False).\
179
+ then(chatf.turn_off_interactivity, inputs=[message, chatbot], outputs=[message, chatbot], queue=False).\
180
+ then(chatf.produce_streaming_answer_chatbot_hf, inputs=[chatbot, instruction_prompt_out], outputs=chatbot)
181
+ response_click.then(chatf.highlight_found_text, [chatbot, sources], [sources]).\
182
+ then(chatf.add_inputs_answer_to_history,[message, chatbot, current_topic], [chat_history_state, current_topic]).\
183
+ then(lambda: gr.update(interactive=True), None, [message], queue=False)
184
+
185
+ response_enter = message.submit(chatf.get_history_sources_final_input_prompt, inputs=[message, chat_history_state, current_topic], outputs=[chat_history_state, sources, instruction_prompt_out], queue=False).\
186
+ then(chatf.turn_off_interactivity, inputs=[message, chatbot], outputs=[message, chatbot], queue=False).\
187
+ then(chatf.produce_streaming_answer_chatbot_hf, [chatbot, instruction_prompt_out], chatbot)
188
+ response_enter.then(chatf.highlight_found_text, [chatbot, sources], [sources]).\
189
+ then(chatf.add_inputs_answer_to_history,[message, chatbot, current_topic], [chat_history_state, current_topic]).\
190
+ then(lambda: gr.update(interactive=True), None, [message], queue=False)
191
+
192
+ # Clear box
193
+ clear.click(chatf.clear_chat, inputs=[chat_history_state, sources, message, current_topic], outputs=[chat_history_state, sources, message, current_topic])
194
+ clear.click(lambda: None, None, chatbot, queue=False)
195
+
196
+ block.queue(concurrency_count=1).launch(debug=True)
197
+ # -
198
+
chatfuncs/.ipynb_checkpoints/chatfuncs-checkpoint.py ADDED
@@ -0,0 +1,553 @@
1
+ # ---
2
+ # jupyter:
3
+ # jupytext:
4
+ # formats: ipynb,py:light
5
+ # text_representation:
6
+ # extension: .py
7
+ # format_name: light
8
+ # format_version: '1.5'
9
+ # jupytext_version: 1.14.6
10
+ # kernelspec:
11
+ # display_name: Python 3 (ipykernel)
12
+ # language: python
13
+ # name: python3
14
+ # ---
15
+
16
+ # +
17
+ import os
18
+ import datetime
19
+ from typing import Dict, List, Tuple
20
+ from itertools import compress
21
+ import pandas as pd
22
+
23
+ from langchain import PromptTemplate
24
+ from langchain.chains import LLMChain
25
+ from langchain.chains.base import Chain
26
+ from langchain.chains.combine_documents.base import BaseCombineDocumentsChain
27
+ from langchain.embeddings import HuggingFaceEmbeddings, HuggingFaceInstructEmbeddings
28
+ from langchain.chains.qa_with_sources import load_qa_with_sources_chain
29
+ from langchain.prompts import PromptTemplate
30
+ from langchain.retrievers import TFIDFRetriever, SVMRetriever
31
+ from langchain.vectorstores import FAISS
32
+ from langchain.llms import HuggingFacePipeline
33
+
34
+ from pydantic import BaseModel
35
+
36
+ import nltk
37
+ from nltk.corpus import stopwords
38
+ from nltk.tokenize import word_tokenize
39
+
40
+ import torch
41
+ #from transformers import pipeline
42
+ from optimum.pipelines import pipeline
43
+ from transformers import AutoTokenizer, TextStreamer, AutoModelForSeq2SeqLM, TextIteratorStreamer
44
+ from threading import Thread
45
+
46
+ import gradio as gr
47
+
48
+
49
+ # -
50
+
51
+ # # Pre-load stopwords, vectorstore, models
52
+
53
+ # +
54
+ def get_faiss_store(faiss_vstore_folder,embeddings):
55
+ import zipfile
56
+ with zipfile.ZipFile(faiss_vstore_folder + '/faiss_lambeth_census_embedding.zip', 'r') as zip_ref:
57
+ zip_ref.extractall(faiss_vstore_folder)
58
+
59
+ faiss_vstore = FAISS.load_local(folder_path=faiss_vstore_folder, embeddings=embeddings)
60
+ os.remove(faiss_vstore_folder + "/index.faiss")
61
+ os.remove(faiss_vstore_folder + "/index.pkl")
62
+
63
+ return faiss_vstore
64
+
65
+ #def set_hf_api_key(api_key, chain_agent):
66
+ #if api_key:
67
+ #os.environ["HUGGINGFACEHUB_API_TOKEN"] = api_key
68
+ #vectorstore = get_faiss_store(faiss_vstore_folder="faiss_lambeth_census_embedding.zip",embeddings=embeddings)
69
+ #qa_chain = create_prompt_templates(vectorstore)
70
+ #print(qa_chain)
71
+ #os.environ["HUGGINGFACEHUB_API_TOKEN"] = ""
72
+ #return qa_chain
73
+
74
+
75
+ # -
76
+
77
+ def create_hf_model(model_name = "declare-lab/flan-alpaca-large"):
78
+
79
+ model_id = model_name
80
+ torch_device = "cuda" if torch.cuda.is_available() else "cpu"
81
+ print("Running on device:", torch_device)
82
+ print("CPU threads:", torch.get_num_threads())
83
+
84
+
85
+
86
+ if torch_device == "cuda":
87
+ model = AutoModelForSeq2SeqLM.from_pretrained(model_id, load_in_8bit=True, device_map="auto")
88
+ else:
89
+ #torch.set_num_threads(8)
90
+ model = AutoModelForSeq2SeqLM.from_pretrained(model_id)
91
+ tokenizer = AutoTokenizer.from_pretrained(model_id)
92
+
93
+ return model, tokenizer, torch_device
94
+
95
+ # +
96
+ # Add some stopwords to nltk default
97
+
98
+ nltk.download('stopwords')
99
+ stopwords = nltk.corpus.stopwords.words('english')
100
+ #print(stopwords.words('english'))
101
+ newStopWords = ['what','how', 'when', 'which', 'who', 'change', 'changed', 'do', 'did', 'increase', 'decrease', 'increased',
102
+ 'decreased', 'proportion', 'percentage', 'report', 'reporting','say', 'said']
103
+ stopwords.extend(newStopWords)
104
+ # -
105
+
106
+ # Embeddings
107
+ #model_name = "sentence-transformers/all-MiniLM-L6-v2"
108
+ #embeddings = HuggingFaceEmbeddings(model_name=model_name)
109
+ embed_model_name = "hkunlp/instructor-large"
110
+ embeddings = HuggingFaceInstructEmbeddings(model_name=embed_model_name)
111
+ vectorstore = get_faiss_store(faiss_vstore_folder="faiss_lambeth_census_embedding",embeddings=embeddings)
112
+
113
+ # +
114
+ # Models
115
+
116
+ #checkpoint = 'declare-lab/flan-alpaca-base' # Flan Alpaca Base often misreads the question based on its wording (e.g. if the question contains words like 'increase' or 'decrease' it will frequently respond incorrectly). Flan Alpaca Large is much more consistent
117
+ checkpoint = 'declare-lab/flan-alpaca-large'
118
+
119
+ model, tokenizer, torch_device = create_hf_model(model_name = checkpoint)
120
+
121
+
122
+ # Look at this for streaming text with huggingface and langchain (last example): https://github.com/hwchase17/langchain/issues/2918
123
+
124
+ streamer = TextStreamer(tokenizer, skip_prompt=True)
125
+
126
+ pipe = pipeline('text2text-generation',
127
+ model = checkpoint,
128
+ # tokenizer = tokenizer,
129
+ max_length=512,
130
+ #do_sample=True,
131
+ temperature=0.000001,
132
+ #top_p=0.95,
133
+ #repetition_penalty=1.15,
134
+ accelerator="bettertransformer",
135
+ streamer=streamer
136
+ )
137
+
138
+ checkpoint_keywords = 'ml6team/keyphrase-generation-t5-small-inspec'
139
+
140
+ keyword_model = pipeline('text2text-generation',
141
+ model = checkpoint_keywords,
142
+ accelerator="bettertransformer"
143
+ )
144
+
145
+
146
+ # -
147
+
148
+ # # Chat history
149
+
150
+ def clear_chat(chat_history_state, sources, chat_message):
151
+ chat_history_state = []
152
+ sources = ''
153
+ chat_message = ''
154
+ return chat_history_state, sources, chat_message
155
+
156
+
157
+ def _get_chat_history(chat_history: List[Tuple[str, str]]): # Limit to last 3 interactions only
158
+ max_chat_length = 3
159
+
160
+ if len(chat_history) > max_chat_length:
161
+ chat_history = chat_history[-max_chat_length:]
162
+
163
+ print(chat_history)
164
+
165
+ first_q = ""
166
+ for human_s, ai_s in chat_history:
167
+ first_q = human_s
168
+ break
169
+
170
+ conversation = ""
171
+ for human_s, ai_s in chat_history:
172
+ human = f"Human: " + human_s
173
+ ai = f"Assistant: " + ai_s
174
+ conversation += "\n" + "\n".join([human, ai])
175
+
176
+ return conversation, first_q
177
+
178
+
179
+ def adapt_q_from_chat_history(keyword_model, new_question_keywords, question, chat_history):
180
+ t5_small_keyphrase = HuggingFacePipeline(pipeline=keyword_model)
181
+ memory_llm = t5_small_keyphrase#flan_alpaca#flan_t5_xxl
182
+ new_q_memory_llm = t5_small_keyphrase#flan_alpaca#flan_t5_xxl
183
+
184
+
185
+ memory_prompt = PromptTemplate(
186
+ template = "{chat_history_first_q}",
187
+ input_variables=["chat_history_first_q"]
188
+ )
189
+ #template = "Extract the names of people, things, or places from the following text: {chat_history}",#\n Original question: {question}\n New list:",
190
+ #template = "Extract keywords, and the names of people or places from the following text: {chat_history}",#\n Original question: {question}\n New list:",
191
+ #\n Original question: {question}\n New list:",
192
+
193
+
194
+ #example_prompt=_eg_prompt,
195
+ #input_variables=["question", "chat_history"]
196
+ #input_variables=["chat_history"]
197
+
198
+ memory_extractor = LLMChain(llm=memory_llm, prompt=memory_prompt)
199
+
200
+ #new_question_keywords = #remove_stopwords(question)
201
+
202
+ print("new_question_keywords:")
203
+ print(new_question_keywords)
204
+
205
+ chat_history_str, chat_history_first_q = _get_chat_history(chat_history)
206
+ if chat_history_str:
207
+
208
+ extracted_memory = memory_extractor.run(
209
+ chat_history_first_q=chat_history_first_q # question=question, chat_history=chat_history_str,
210
+ )
211
+
212
+ new_question_kworded = extracted_memory + " " + new_question_keywords
213
+ new_question = extracted_memory + " " + question
214
+
215
+ else:
216
+ new_question = question
217
+ new_question_kworded = new_question_keywords
218
+
219
+ return new_question, new_question_kworded
220
+
221
+
222
+ # # Prompt creation
223
+
224
+ def remove_q_stopwords(question):
225
+ # Prepare question by removing keywords
226
+ text = question.lower()
227
+ text_tokens = word_tokenize(text)
228
+ tokens_without_sw = [word for word in text_tokens if not word in stopwords]
229
+ new_question_keywords = ' '.join(tokens_without_sw)
230
+ return new_question_keywords, question
231
+
232
+
233
+ def create_final_prompt(inputs: Dict[str, str], vectorstore, instruction_prompt, content_prompt):
234
+
235
+ question = inputs["question"]
236
+ chat_history = inputs["chat_history"]
237
+
238
+ new_question_keywords, question = remove_q_stopwords(question)
239
+
240
+ new_question, new_question_kworded = adapt_q_from_chat_history(keyword_model, new_question_keywords, question, chat_history)
241
+
242
+
243
+ print("The question passed to the vector search is:")
244
+ print(new_question_kworded)
245
+
246
+ docs_keep_as_doc, docs_content, docs_url = find_relevant_passages(new_question_kworded, embeddings, k_val = 3, out_passages = 2, vec_score_cut_off = 1.3, vec_weight = 1, tfidf_weight = 0.5, svm_weight = 1)
247
+
248
+ if docs_keep_as_doc == []:
249
+ {"answer": "I'm sorry, I couldn't find a relevant answer to this question.", "sources":"I'm sorry, I couldn't find a relevant source for this question."}
250
+
251
+ #new_inputs = inputs.copy()
252
+ #new_inputs["question"] = new_question
253
+ #new_inputs["chat_history"] = chat_history_str
254
+
255
+ string_docs_content = '\n\n\n'.join(docs_content)
256
+
257
+ #print("The draft instruction prompt is:")
258
+ #print(instruction_prompt)
259
+
260
+ instruction_prompt_out = instruction_prompt.format(question=new_question, summaries=string_docs_content)
261
+ #print("The final instruction prompt:")
262
+ #print(instruction_prompt_out)
263
+
264
+
265
+ return instruction_prompt_out, string_docs_content
266
+
267
+
268
+ # +
269
+ def create_prompt_templates():
270
+
271
+ #EXAMPLE_PROMPT = PromptTemplate(
272
+ # template="\nCONTENT:\n\n{page_content}\n\nSOURCE: {source}\n\n",
273
+ # input_variables=["page_content", "source"],
274
+ #)
275
+
276
+ CONTENT_PROMPT = PromptTemplate(
277
+ template="{page_content}\n\n",#\n\nSOURCE: {source}\n\n",
278
+ input_variables=["page_content"]
279
+ )
280
+
281
+
282
+ # The main prompt:
283
+
284
+ #main_prompt_template = """
285
+ #Answer the question using the CONTENT below:
286
+
287
+ #CONTENT: {summaries}
288
+
289
+ #QUESTION: {question}
290
+
291
+ #ANSWER: """
292
+
293
+ instruction_prompt_template = """
294
+ {summaries}
295
+
296
+ QUESTION: {question}
297
+
298
+ Quote relevant text above."""
299
+
300
+
301
+ INSTRUCTION_PROMPT=PromptTemplate(template=instruction_prompt_template, input_variables=['question', 'summaries'])
302
+
303
+ return INSTRUCTION_PROMPT, CONTENT_PROMPT
304
+
305
+
306
+ # -
307
+
308
+ def get_history_sources_final_input_prompt(user_input, history):
309
+
310
+ #if chain_agent is None:
311
+ # history.append((user_input, "Please click the button to submit the Huggingface API key before using the chatbot (top right)"))
312
+ # return history, history, "", ""
313
+ print("\n==== date/time: " + str(datetime.datetime.now()) + " ====")
314
+ print("User input: " + user_input)
315
+
316
+ history = history or []
317
+
318
+
319
+
320
+ # Create instruction prompt
321
+ instruction_prompt, content_prompt = create_prompt_templates()
322
+ instruction_prompt_out, string_docs_content =\
323
+ create_final_prompt({"question": user_input, "chat_history": history}, vectorstore,
324
+ instruction_prompt, content_prompt)
325
+
326
+ sources_txt = string_docs_content
327
+
328
+ #print('sources_txt:')
329
+ #print(sources_txt)
330
+
331
+ history.append(user_input)
332
+
333
+ print("Output history is:")
334
+ print(history)
335
+
336
+ print("The output prompt is:")
337
+ print(instruction_prompt_out)
338
+
339
+ return history, sources_txt, instruction_prompt_out
340
+
341
+
342
+ # # Chat functions
343
+
344
+ def produce_streaming_answer_chatbot(history, full_prompt):
345
+
346
+ print("The question is: ")
347
+ print(full_prompt)
348
+
349
+ # Get the model and tokenizer, and tokenize the user text.
350
+ model_inputs = tokenizer(text=full_prompt, return_tensors="pt").to(torch_device)
351
+
352
+ # Start generation on a separate thread, so that we don't block the UI. The text is pulled from the streamer
353
+ # in the main thread. Adds timeout to the streamer to handle exceptions in the generation thread.
354
+ streamer = TextIteratorStreamer(tokenizer, timeout=10., skip_prompt=True, skip_special_tokens=True)
355
+ generate_kwargs = dict(
356
+ model_inputs,
357
+ streamer=streamer,
358
+ max_new_tokens=512,
359
+ do_sample=True,
360
+ #top_p=top_p,
361
+ temperature=float(0.00001)#,
362
+ #top_k=top_k
363
+ )
364
+ t = Thread(target=model.generate, kwargs=generate_kwargs)
365
+ t.start()
366
+
367
+ # Pull the generated text from the streamer, and update the model output.
368
+
369
+ history[-1][1] = ""
370
+ for new_text in streamer:
371
+ history[-1][1] += new_text
372
+ yield history
373
+
374
+
375
+ def user(user_message, history):
376
+ return gr.update(value="", interactive=False), history + [[user_message, None]]
377
+
378
+
379
+ def add_inputs_answer_to_history(user_message, history):
380
+ #history.append((user_message, [-1]))
381
+
382
+ print("History after appending is:")
383
+ print(history)
384
+
385
+
386
+ return history
387
+
388
+
389
+ # # Vector / hybrid search
390
+
391
+ def find_relevant_passages(new_question_kworded, embeddings, k_val, out_passages, vec_score_cut_off, vec_weight, tfidf_weight, svm_weight, vectorstore=vectorstore):
392
+
393
+ docs = vectorstore.similarity_search_with_score(new_question_kworded, k=k_val)
394
+ #docs = self.vstore.similarity_search_with_score(new_question_kworded, k=k_val)
395
+
396
+ # Keep only documents with a certain score
397
+ #docs_orig = [x[0] for x in docs]
398
+ docs_scores = [x[1] for x in docs]
399
+
400
+ # Only keep sources that are sufficiently relevant (i.e. similarity search score below threshold below)
401
+ score_more_limit = pd.Series(docs_scores) < vec_score_cut_off
402
+ docs_keep = list(compress(docs, score_more_limit))
403
+
404
+ if docs_keep == []:
405
+ docs_keep_as_doc = []
406
+ docs_content = []
407
+ docs_url = []
408
+ return docs_keep_as_doc, docs_content, docs_url
409
+
410
+
411
+
412
+ docs_keep_as_doc = [x[0] for x in docs_keep]
413
+ docs_keep_length = len(docs_keep_as_doc)
414
+
415
+ #print('docs_keep:')
416
+ #print(docs_keep)
417
+
418
+ vec_rank = [*range(1, docs_keep_length+1)]
419
+ vec_score = [(docs_keep_length/x)*vec_weight for x in vec_rank]
420
+
421
+ #print("vec_rank")
422
+ #print(vec_rank)
423
+
424
+ #print("vec_score")
425
+ #print(vec_score)
426
+
427
+
428
+
429
+ # 2nd level check on retrieved docs with TFIDF
430
+ content_keep=[]
431
+ for item in docs_keep:
432
+ content_keep.append(item[0].page_content)
433
+
434
+ tfidf_retriever = TFIDFRetriever.from_texts(content_keep, k = k_val)
435
+ tfidf_result = tfidf_retriever.get_relevant_documents(new_question_kworded)
436
+
437
+ #print("TDIDF retriever result:")
438
+ #print(tfidf_result)
439
+
440
+ tfidf_rank=[]
441
+ tfidf_score = []
442
+
443
+ for vec_item in docs_keep:
444
+ x = 0
445
+ for tfidf_item in tfidf_result:
446
+ x = x + 1
447
+ if tfidf_item.page_content == vec_item[0].page_content:
448
+ tfidf_rank.append(x)
449
+ tfidf_score.append((docs_keep_length/x)*tfidf_weight)
450
+
451
+ #print("tfidf_rank:")
452
+ #print(tfidf_rank)
453
+ #print("tfidf_score:")
454
+ #print(tfidf_score)
455
+
456
+
457
+ # 3rd level check on retrieved docs with SVM retriever
458
+ svm_retriever = SVMRetriever.from_texts(content_keep, embeddings, k = k_val)
459
+ svm_result = svm_retriever.get_relevant_documents(new_question_kworded)
460
+
461
+ #print("SVM retriever result:")
462
+ #print(svm_result)
463
+
464
+ svm_rank=[]
465
+ svm_score = []
466
+
467
+ for vec_item in docs_keep:
468
+ x = 0
469
+ for svm_item in svm_result:
470
+ x = x + 1
471
+ if svm_item.page_content == vec_item[0].page_content:
472
+ svm_rank.append(x)
473
+ svm_score.append((docs_keep_length/x)*svm_weight)
474
+
475
+ #print("svm_score:")
476
+ #print(svm_score)
477
+
478
+
479
+ ## Calculate final score based on three ranking methods
480
+ final_score = [a + b + c for a, b, c in zip(vec_score, tfidf_score, svm_score)]
481
+ final_rank = [sorted(final_score, reverse=True).index(x)+1 for x in final_score]
482
+
483
+ #print("Final score:")
484
+ #print(final_score)
485
+ #print("final rank:")
486
+ #print(final_rank)
487
+
488
+ best_rank_index_pos = []
489
+
490
+ for x in range(1,out_passages+1):
491
+ try:
492
+ best_rank_index_pos.append(final_rank.index(x))
493
+ except ValueError: # list.index raises ValueError if that rank is not present
494
+ pass
495
+
496
+ # Adjust best_rank_index_pos to
497
+
498
+ #print("Best rank positions in original vector search list:")
499
+ #print(best_rank_index_pos)
500
+
501
+ best_rank_pos_series = pd.Series(best_rank_index_pos)
502
+ #docs_keep_out = list(compress(docs_keep, best_rank_pos_series))
503
+
504
+ #print("docs_keep:")
505
+ #print(docs_keep)
506
+
507
+ docs_keep_out = [docs_keep[i] for i in best_rank_index_pos]
508
+
509
+
510
+ #docs_keep = [(docs_keep[best_rank_pos])]
511
+ # Keep only 'best' options
512
+ docs_keep_as_doc = [x[0] for x in docs_keep_out]# [docs_keep_as_doc_filt[0]]#[x[0] for x in docs_keep_as_doc_filt] #docs_keep_as_doc_filt[0]#
513
+
514
+ #print("docs_keep_out:")
515
+ #print(docs_keep_out)
516
+
517
+ # Extract content and metadata from 'winning' passages.
518
+
519
+ content=[]
520
+ meta_url=[]
521
+ score=[]
522
+
523
+ for item in docs_keep_out:
524
+ content.append(item[0].page_content)
525
+ meta_url.append(item[0].metadata['source'])
526
+ score.append(item[1])
527
+
528
+ # Create df from 'winning' passages
529
+
530
+ doc_df = pd.DataFrame(list(zip(content, meta_url, score)),
531
+ columns =['page_content', 'meta_url', 'score'])#.iloc[[0, 1]]
532
+
533
+ #print("docs_keep_as_doc: ")
534
+ #print(docs_keep_as_doc)
535
+
536
+ #print("doc_df")
537
+ #print(doc_df)
538
+
539
+ docs_content = doc_df['page_content'].astype(str)
540
+ docs_url = "https://" + doc_df['meta_url']
541
+
542
+ #print("Docs meta url is: ")
543
+ #print(docs_meta_url)
544
+
545
+ #print("Docs content is: ")
546
+ #print(docs_content)
547
+
548
+ #docs_url = [d['source'] for d in docs_meta]
549
+ #print(docs_url)
550
+
551
+
552
+
553
+ return docs_keep_as_doc, docs_content, docs_url
chatfuncs/.ipynb_checkpoints/ingest-checkpoint.py ADDED
@@ -0,0 +1,509 @@
1
+ # ---
2
+ # jupyter:
3
+ # jupytext:
4
+ # formats: ipynb,py:light
5
+ # text_representation:
6
+ # extension: .py
7
+ # format_name: light
8
+ # format_version: '1.5'
9
+ # jupytext_version: 1.14.6
10
+ # kernelspec:
11
+ # display_name: Python 3 (ipykernel)
12
+ # language: python
13
+ # name: python3
14
+ # ---
15
+
16
+ # # Ingest website to FAISS
17
+
18
+ # ## Install/ import stuff we need
19
+
20
+ import os
21
+ from pathlib import Path
22
+ import re
23
+ import requests
24
+ import pandas as pd
25
+ import dateutil.parser
26
+ from typing import TypeVar, List
27
+
28
+ from langchain.embeddings import HuggingFaceInstructEmbeddings, HuggingFaceEmbeddings
29
+ from langchain.vectorstores.faiss import FAISS
30
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
31
+ from langchain.docstore.document import Document
32
+ from langchain.document_loaders import PyPDFLoader
33
+
34
+ import magic
35
+ from bs4 import BeautifulSoup
36
+ from docx import Document as Doc
37
+ from pypdf import PdfReader
38
+ from docx import Document
39
+
40
+ PandasDataFrame = TypeVar('pd.core.frame.DataFrame')
41
+ # -
42
+
43
+ split_strat = [".", "!", "?", "\n\n", "\n", ",", " ", ""]
44
+ chunk_size = 1000
45
+ chunk_overlap = 200
46
+
47
+ ## Overarching ingest function:
48
+
49
+
50
+ def determine_file_type(file_path):
51
+ """
52
+ Determine the MIME type of the given file using the magic library.
53
+
54
+ Parameters:
55
+ file_path (str): Path to the file.
56
+
57
+ Returns:
58
+ str: MIME type of the file.
59
+ """
60
+ return magic.from_file(file_path, mime=True)
61
+
62
+ def parse_pdf(file) -> List[str]:
63
+
64
+ """
65
+ Extract text from a PDF file.
66
+
67
+ Parameters:
68
+ file_path (str): Path to the PDF file.
69
+
70
+ Returns:
71
+ List[str]: Extracted text from the PDF.
72
+ """
73
+
74
+ output = []
75
+ for i in range(0,len(file)):
76
+ print(file[i].name)
77
+ pdf = PdfReader(file[i].name) #[i]
78
+ for page in pdf.pages:
79
+ text = page.extract_text()
80
+ # Merge hyphenated words
81
+ text = re.sub(r"(\w+)-\n(\w+)", r"\1\2", text)
82
+ # Fix newlines in the middle of sentences
83
+ text = re.sub(r"(?<!\n\s)\n(?!\s\n)", " ", text.strip())
84
+ # Remove multiple newlines
85
+ text = re.sub(r"\n\s*\n", "\n\n", text)
86
+ output.append(text)
87
+ return output
88
+
89
+
90
+ def parse_docx(file_path):
91
+ """
92
+ Reads the content of a .docx file and returns it as a string.
93
+
94
+ Parameters:
95
+ - file_path (str): Path to the .docx file.
96
+
97
+ Returns:
98
+ - str: Content of the .docx file.
99
+ """
100
+ doc = Doc(file_path)
101
+ full_text = []
102
+ for para in doc.paragraphs:
103
+ full_text.append(para.text)
104
+ return '\n'.join(full_text)
105
+
106
+
107
+ def parse_txt(file_path):
108
+ """
109
+ Read text from a TXT or HTML file.
110
+
111
+ Parameters:
112
+ file_path (str): Path to the TXT or HTML file.
113
+
114
+ Returns:
115
+ str: Text content of the file.
116
+ """
117
+ with open(file_path, 'r', encoding="utf-8") as file:
118
+ return file.read()
119
+
120
+
121
+
122
+ def parse_file(file_paths):
123
+ """
124
+ Accepts a list of file paths, determines each file's type,
125
+ and passes it to the relevant parsing function.
126
+
127
+ Parameters:
128
+ file_paths (list): List of file paths.
129
+
130
+ Returns:
131
+ dict: A dictionary with file paths as keys and their parsed content (or error message) as values.
132
+ """
133
+ if not isinstance(file_paths, list):
134
+ raise ValueError("Expected a list of file paths.")
135
+
136
+ mime_type_to_parser = {
137
+ 'application/pdf': parse_pdf,
138
+ 'application/vnd.openxmlformats-officedocument.wordprocessingml.document': parse_docx,
139
+ 'text/plain': parse_txt,
140
+ 'text/html': parse_html
141
+ }
142
+
143
+ parsed_contents = {}
144
+
145
+ for file_path in file_paths:
146
+ mime_type = determine_file_type(file_path)
147
+ if mime_type in mime_type_to_parser:
148
+ parsed_contents[file_path] = mime_type_to_parser[mime_type](file_path)
149
+ else:
150
+ parsed_contents[file_path] = f"Unsupported file type: {mime_type}"
151
+
152
+ return parsed_contents
153
+
154
+
155
+
156
+
157
+ def parse_html(page_url, div_filter="p"):
158
+ """
159
+ Determine if the source is a web URL or a local HTML file, extract the content based on the div of choice. Also tries to extract dates (WIP)
160
+
161
+ Parameters:
162
+ page_url (str): The web URL or local file path.
163
+
164
+ Returns:
165
+ str: Extracted content.
166
+ """
167
+
168
+ def is_web_url(s):
169
+ """
170
+ Check if the input string is a web URL.
171
+ """
172
+ return s.startswith("http://") or s.startswith("https://")
173
+
174
+ def is_local_html_file(s):
175
+ """
176
+ Check if the input string is a path to a local HTML file.
177
+ """
178
+ return (s.endswith(".html") or s.endswith(".htm")) and os.path.isfile(s)
179
+
180
+ def extract_text_from_source(source):
181
+ """
182
+ Determine if the source is a web URL or a local HTML file,
183
+ and then extract its content accordingly.
184
+
185
+ Parameters:
186
+ source (str): The web URL or local file path.
187
+
188
+ Returns:
189
+ str: Extracted content.
190
+ """
191
+ if is_web_url(source):
192
+ response = requests.get(source)
193
+ response.raise_for_status() # Raise an HTTPError for bad responses
194
+ return response.text
195
+ elif is_local_html_file(source):
196
+ with open(source, 'r', encoding='utf-8') as file:
197
+ return file.read()
198
+ else:
199
+ raise ValueError("Input is neither a valid web URL nor a local HTML file path.")
200
+
201
+ def clean_html_data(data, date_filter="", div_filt="p"):
202
+ """
203
+ Extracts and cleans data from HTML content.
204
+
205
+ Parameters:
206
+ data (str): HTML content to be parsed.
207
+ date_filter (str, optional): Date string to filter results. If set, only content with a date greater than this will be returned.
208
+ div_filt (str, optional): HTML tag to search for text content. Defaults to "p".
209
+
210
+ Returns:
211
+ tuple: Contains extracted text and date as strings. Returns empty strings if not found.
212
+ """
213
+
214
+ soup = BeautifulSoup(data, 'html.parser')
215
+
216
+ # Function to exclude div with id "bar"
217
+ def exclude_div_with_id_bar(tag):
218
+ return tag.has_attr('id') and tag['id'] == 'related-links'
219
+
220
+ text_elements = soup.find_all(div_filt)
221
+ date_elements = soup.find_all(div_filt, {"class": "page-neutral-intro__meta"})
222
+
223
+ # Extract date
224
+ date_out = ""
225
+ if date_elements:
226
+ date_out = re.search(">(.*?)<", str(date_elements[0])).group(1)
227
+ date_dt = dateutil.parser.parse(date_out)
228
+
229
+ if date_filter:
230
+ date_filter_dt = dateutil.parser.parse(date_filter)
231
+ if date_dt < date_filter_dt:
232
+ return '', date_out
233
+
234
+ # Extract text
235
+ text_out_final = ""
236
+ if text_elements:
237
+ text_out_final = '\n'.join(paragraph.text for paragraph in text_elements)
238
+ else:
239
+ print(f"No elements found with tag '{div_filt}'. No text returned.")
240
+
241
+ return text_out_final, date_out
242
+
243
+
244
+ #page_url = "https://pypi.org/project/InstructorEmbedding/" #'https://www.ons.gov.uk/visualisations/censusareachanges/E09000022/index.html'
245
+
246
+ html_text = extract_text_from_source(page_url)
247
+ #print(page.text)
248
+
249
+ texts = []
250
+ metadatas = []
251
+
252
+ clean_text, date = clean_html_data(html_text, date_filter="", div_filt=div_filter)
253
+ texts.append(clean_text)
254
+ metadatas.append({"source": page_url, "date":str(date)})
255
+
256
+ return texts, metadatas
257
+
258
+
259
+ # +
260
+ # Convert parsed text to docs
261
+ # -
262
+
263
+ def text_to_docs(text_dict: dict, chunk_size: int = chunk_size) -> List[Document]:
264
+ """
265
+ Converts the output of parse_file (a dictionary of file paths to content)
266
+ to a list of Documents with metadata.
267
+ """
268
+
269
+ doc_chunks = []
270
+
271
+ for file_path, content in text_dict.items():
272
+ ext = os.path.splitext(file_path)[1].lower()
273
+
274
+ # Depending on the file extension, handle the content
275
+ if ext == '.pdf':
276
+ docs = pdf_text_to_docs(content, chunk_size)
277
+ elif ext in ['.html', '.htm', '.txt', '.docx']:
278
+ # Assuming you want to process HTML similarly to PDF in this context
279
+ docs = html_text_to_docs(content, chunk_size)
280
+ else:
281
+ print(f"Unsupported file type {ext} for {file_path}. Skipping.")
282
+ continue
283
+
284
+ # Add filename as metadata
285
+ for doc in docs:
286
+ doc.metadata["file"] = file_path
287
+
288
+ doc_chunks.extend(docs)
289
+
290
+ return doc_chunks
291
+
292
+
293
+
294
+ def pdf_text_to_docs(text: str, chunk_size: int = chunk_size) -> List[Document]:
295
+ """Converts a string or list of strings to a list of Documents
296
+ with metadata."""
297
+ if isinstance(text, str):
298
+ # Take a single string as one page
299
+ text = [text]
300
+
301
+ page_docs = [Document(page_content=page) for page in text]
302
+
303
+ # Add page numbers as metadata
304
+ for i, doc in enumerate(page_docs):
305
+ doc.metadata["page"] = i + 1
306
+
307
+ # Split pages into chunks
308
+ doc_chunks = []
309
+
310
+ for doc in page_docs:
311
+ text_splitter = RecursiveCharacterTextSplitter(
312
+ chunk_size=chunk_size,
313
+ separators=split_strat,#["\n\n", "\n", ".", "!", "?", ",", " ", ""],
314
+ chunk_overlap=chunk_overlap,
315
+ )
316
+ chunks = text_splitter.split_text(doc.page_content)
317
+
318
+
319
+ for i, chunk in enumerate(chunks):
320
+ doc = Document(
321
+ page_content=chunk, metadata={"page": doc.metadata["page"], "chunk": i}
322
+ )
323
+ # Add sources as metadata
324
+ doc.metadata["page_chunk"] = f"{doc.metadata['page']}-{doc.metadata['chunk']}"
325
+ doc_chunks.append(doc)
326
+ return doc_chunks
327
+
328
+ def html_text_to_docs(texts, metadatas, chunk_size:int = chunk_size):
329
+
330
+ text_splitter = RecursiveCharacterTextSplitter(
331
+ separators=split_strat,#["\n\n", "\n", ".", "!", "?", ",", " ", ""],
332
+ chunk_size=chunk_size,
333
+ chunk_overlap=chunk_overlap,
334
+ length_function=len
335
+ )
336
+
337
+ #print(texts)
338
+ #print(metadatas)
339
+
340
+ documents = text_splitter.create_documents(texts, metadatas=metadatas)
341
+
342
+ for i, chunk in enumerate(documents):
343
+ chunk.metadata["chunk"] = i + 1
344
+
345
+ return documents
346
+
347
+
348
+
349
+
350
+
351
+
352
+ # # Functions for working with documents after loading them back in
353
+
354
+ def pull_out_data(series):
355
+
356
+ # define a lambda function to convert each string into a tuple
357
+ to_tuple = lambda x: eval(x)
358
+
359
+ # apply the lambda function to each element of the series
360
+ series_tup = series.apply(to_tuple)
361
+
362
+ series_tup_content = list(zip(*series_tup))[1]
363
+
364
+ series = pd.Series(list(series_tup_content))#.str.replace("^Main post content", "", regex=True).str.strip()
365
+
366
+ return series
367
+
368
+
369
+ def docs_from_csv(df):
370
+
371
+ import ast
372
+
373
+ documents = []
374
+
375
+ page_content = pull_out_data(df["0"])
376
+ metadatas = pull_out_data(df["1"])
377
+
378
+ for x in range(0,len(df)):
379
+ new_doc = Document(page_content=page_content[x], metadata=metadatas[x])
380
+ documents.append(new_doc)
381
+
382
+ return documents
383
+
384
+
385
+ def docs_from_lists(docs, metadatas):
386
+
387
+ documents = []
388
+
389
+ for x, doc in enumerate(docs):
390
+ new_doc = Document(page_content=doc, metadata=metadatas[x])
391
+ documents.append(new_doc)
392
+
393
+ return documents
394
+
395
+
396
+ def docs_elements_from_csv_save(docs_path="documents.csv"):
397
+
398
+ documents = pd.read_csv(docs_path)
399
+
400
+ docs_out = docs_from_csv(documents)
401
+
402
+ out_df = pd.DataFrame(docs_out)
403
+
404
+ docs_content = pull_out_data(out_df[0].astype(str))
405
+
406
+ docs_meta = pull_out_data(out_df[1].astype(str))
407
+
408
+ doc_sources = [d['source'] for d in docs_meta]
409
+
410
+ return out_df, docs_content, docs_meta, doc_sources
411
+
412
+
413
+ # documents = html_text_to_docs(texts, metadatas)
414
+ #
415
+ # documents[0]
416
+ #
417
+ # pd.DataFrame(documents).to_csv("documents.csv", index=None)
418
+
419
+ # ## Create embeddings and save faiss vector store to the path specified in `save_to`
420
+
421
+ def load_embeddings(model_name = "hkunlp/instructor-large"):
422
+
423
+ if model_name == "hkunlp/instructor-large":
424
+ embeddings_func = HuggingFaceInstructEmbeddings(model_name=model_name,
425
+ embed_instruction="Represent the paragraph for retrieval: ",
426
+ query_instruction="Represent the question for retrieving supporting documents: "
427
+ )
428
+
429
+ else:
430
+ embeddings_func = HuggingFaceEmbeddings(model_name=model_name)
431
+
432
+ global embeddings
433
+
434
+ embeddings = embeddings_func
435
+
436
+ #return embeddings_func
437
+
438
+
439
+ def embed_faiss_save_to_zip(docs_out, save_to="faiss_lambeth_census_embedding", model_name = "hkunlp/instructor-large"):
440
+
441
+ load_embeddings(model_name=model_name)
442
+
443
+ #embeddings_fast = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
444
+
445
+ print(f"> Total split documents: {len(docs_out)}")
446
+
447
+ vectorstore = FAISS.from_documents(documents=docs_out, embedding=embeddings)
448
+
449
+
450
+ if Path(save_to).exists():
451
+ vectorstore.save_local(folder_path=save_to)
452
+
453
+ print("> DONE")
454
+ print(f"> Saved to: {save_to}")
455
+
456
+ ### Save as zip, then remove faiss/pkl files to allow for upload to huggingface
457
+
458
+ import shutil
459
+
460
+ shutil.make_archive(save_to, 'zip', save_to)
461
+
462
+ os.remove(save_to + "/index.faiss")
463
+ os.remove(save_to + "/index.pkl")
464
+
465
+ shutil.move(save_to + '.zip', save_to + "/" + save_to + '.zip')
466
+
467
+ return vectorstore
468
+
469
+
470
+ # +
471
+ # https://colab.research.google.com/drive/1RWqGXd2B6sPchlYVihKaBSsHy9zWRcYF#scrollTo=Q_eTIZwf4Dk2
472
+
473
+ def docs_to_chroma_save(embeddings, docs_out:PandasDataFrame, save_to:str):
+ from langchain.vectorstores import Chroma # Chroma is not imported at the top of this file, so import it here
474
+ print(f"> Total split documents: {len(docs_out)}")
475
+
476
+ vectordb = Chroma.from_documents(documents=docs_out,
477
+ embedding=embeddings,
478
+ persist_directory=save_to)
479
+
480
+ # persist the db to disk
481
+ vectordb.persist()
482
+
483
+ print("> DONE")
484
+ print(f"> Saved to: {save_to}")
485
+
486
+ return vectordb
487
+
488
+
489
+ # + [markdown] jp-MarkdownHeadingCollapsed=true
490
+ # ## Similarity search on saved vectorstore
491
+ # -
492
+
493
+ def sim_search_local_saved_vec(query, k_val, save_to="faiss_lambeth_census_embedding"):
494
+
495
+ load_embeddings()
496
+
497
+ docsearch = FAISS.load_local(folder_path=save_to, embeddings=embeddings)
498
+
499
+
500
+ print(query) # 'question' and 'Markdown' were undefined here; print the query being searched instead
501
+
502
+ search = docsearch.similarity_search_with_score(query, k=k_val)
503
+
504
+ for item in search:
505
+ print(item[0].page_content)
506
+ print(f"Page: {item[0].metadata['source']}")
507
+ print(f"Date: {item[0].metadata['date']}")
508
+ print(f"Score: {item[1]}")
509
+ print("---")
chatfuncs/__init__.py ADDED
File without changes
chatfuncs/chatfuncs.py ADDED
@@ -0,0 +1,968 @@
1
+ import re
2
+ import datetime
3
+ from typing import TypeVar, Dict, List, Tuple
4
+ from itertools import compress
5
+ import pandas as pd
6
+ import numpy as np
7
+
8
+ # Model packages
9
+ import torch
10
+ from threading import Thread
11
+ from transformers import AutoTokenizer, pipeline, TextIteratorStreamer
12
+
13
+ # Alternative model sources
14
+ from gpt4all import GPT4All
15
+ from ctransformers import AutoModelForCausalLM
16
+
17
+ from dataclasses import asdict, dataclass
18
+
19
+ # Langchain functions
20
+ from langchain import PromptTemplate
21
+ from langchain.prompts import PromptTemplate
22
+ from langchain.vectorstores import FAISS
23
+ from langchain.retrievers import SVMRetriever
24
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
25
+ from langchain.docstore.document import Document
26
+
27
+ # For keyword extraction
28
+ import nltk
29
+ from nltk.corpus import stopwords
30
+ from nltk.tokenize import RegexpTokenizer
31
+ from nltk.stem import WordNetLemmatizer
32
+ import keybert
33
+ #from transformers.pipelines import pipeline
34
+
35
+ # For Name Entity Recognition model
36
+ from span_marker import SpanMarkerModel
37
+
38
+ # For BM25 retrieval
39
+ from gensim.corpora import Dictionary
40
+ from gensim.models import TfidfModel, OkapiBM25Model
41
+ from gensim.similarities import SparseMatrixSimilarity
42
+
43
+ import gradio as gr
44
+
45
+ torch_device = "cuda" if torch.cuda.is_available() else "cpu"
46
+ print("Running on device:", torch_device)
47
+ threads = torch.get_num_threads()
48
+ print("CPU threads:", threads)
49
+
50
+ PandasDataFrame = TypeVar('pd.core.frame.DataFrame')
51
+
52
+ embeddings = None # global variable setup
53
+ vectorstore = None # global variable setup
54
+
55
+ full_text = "" # Define dummy source text (full text) just to enable highlight function to load
56
+
57
+ ctrans_llm = [] # Define empty list to hold CTrans LLMs for functions to run
58
+
59
+ temperature: float = 0.1
60
+ top_k: int = 3
61
+ top_p: float = 1
62
+ repetition_penalty: float = 1.05
63
+ last_n_tokens: int = 64
64
+ max_new_tokens: int = 125
65
+ #seed: int = 42
66
+ reset: bool = False
67
+ stream: bool = True
68
+ threads: int = threads
69
+ batch_size:int = 512
70
+ context_length:int = 2048
71
+ gpu_layers:int = 0
72
+ sample = False
73
+
74
+ ## Highlight text constants
75
+ hlt_chunk_size = 20
76
+ hlt_strat = [" ", ".", "!", "?", ":", "\n\n", "\n", ","]
77
+ hlt_overlap = 0
78
+
79
+ ## Initialise NER model ##
80
+ ner_model = SpanMarkerModel.from_pretrained("tomaarsen/span-marker-mbert-base-multinerd")
81
+
82
+ ## Initialise keyword model ##
83
+ # Used to pull out keywords from chat history to add to user queries behind the scenes
84
+ kw_model = pipeline("feature-extraction", model="thenlper/gte-base")
85
+
86
+ ## Chat models ##
87
+ ctrans_llm = AutoModelForCausalLM.from_pretrained('TheBloke/orca_mini_3B-GGML', model_type='llama', model_file='orca-mini-3b.ggmlv3.q4_0.bin')
88
+ #ctrans_llm = AutoModelForCausalLM.from_pretrained('TheBloke/orca_mini_3B-GGML', model_type='llama', model_file='orca-mini-3b.ggmlv3.q8_0.bin')
89
+ #gpt4all_model = GPT4All(model_name= "orca-mini-3b.ggmlv3.q4_0.bin", model_path="models/") # "ggml-mpt-7b-chat.bin"
90
+
91
+ # Huggingface chat model
92
+ hf_checkpoint = 'declare-lab/flan-alpaca-large'
93
+
94
+ def create_hf_model(model_name):
95
+
96
+ from transformers import AutoModelForSeq2SeqLM, AutoModelForCausalLM
97
+
98
+ # model_id = model_name
99
+
100
+ if torch_device == "cuda":
101
+ if "flan" in model_name:
102
+ model = AutoModelForSeq2SeqLM.from_pretrained(model_name, load_in_8bit=True, device_map="auto")
103
+ elif "mpt" in model_name:
104
+ model = AutoModelForCausalLM.from_pretrained(model_name, load_in_8bit=True, device_map="auto", trust_remote_code=True)
105
+ else:
106
+ model = AutoModelForCausalLM.from_pretrained(model_name, load_in_8bit=True, device_map="auto")
107
+ else:
108
+ if "flan" in model_name:
109
+ model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
110
+ elif "mpt" in model_name:
111
+ model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)
112
+ else:
113
+ model = AutoModelForCausalLM.from_pretrained(model_name)
114
+
115
+ tokenizer = AutoTokenizer.from_pretrained(model_name, model_max_length = 2048)
116
+
117
+ return model, tokenizer, torch_device
118
+
119
+ model, tokenizer, torch_device = create_hf_model(model_name = hf_checkpoint)
120
+
121
+ # Vectorstore funcs
122
+
123
+ def docs_to_faiss_save(docs_out:PandasDataFrame, embeddings=embeddings):
124
+
125
+ print(f"> Total split documents: {len(docs_out)}")
126
+
127
+ vectorstore_func = FAISS.from_documents(documents=docs_out, embedding=embeddings)
128
+
129
+ '''
130
+ #with open("vectorstore.pkl", "wb") as f:
131
+ #pickle.dump(vectorstore, f)
132
+ '''
133
+
134
+ #if Path(save_to).exists():
135
+ # vectorstore_func.save_local(folder_path=save_to)
136
+ #else:
137
+ # os.mkdir(save_to)
138
+ # vectorstore_func.save_local(folder_path=save_to)
139
+
140
+ global vectorstore
141
+
142
+ vectorstore = vectorstore_func
143
+
144
+ out_message = "Document processing complete"
145
+
146
+ #print(out_message)
147
+ #print(f"> Saved to: {save_to}")
148
+
149
+ return out_message
150
+
151
+ # # Prompt functions
152
+
153
+ def create_prompt_templates():
154
+
155
+ #EXAMPLE_PROMPT = PromptTemplate(
156
+ # template="\nCONTENT:\n\n{page_content}\n\nSOURCE: {source}\n\n",
157
+ # input_variables=["page_content", "source"],
158
+ #)
159
+
160
+ CONTENT_PROMPT = PromptTemplate(
161
+ template="{page_content}\n\n",#\n\nSOURCE: {source}\n\n",
162
+ input_variables=["page_content"]
163
+ )
164
+
165
+
166
+ # The main prompt:
167
+
168
+ instruction_prompt_template_alpaca_quote = """### Instruction:
169
+ Quote directly from the SOURCE below that best answers the QUESTION. Only quote full sentences in the correct order. If you cannot find an answer, start your response with "My best guess is: ".
170
+
171
+ CONTENT: {summaries}
172
+
173
+ QUESTION: {question}
174
+
175
+ Response:"""
176
+
177
+ instruction_prompt_template_orca = """
178
+ ### System:
179
+ You are an AI assistant that follows instruction extremely well. Help as much as you can.
180
+ ### User:
181
+ Answer the QUESTION using information from the following CONTENT.
182
+ CONTENT: {summaries}
183
+ QUESTION: {question}
184
+
185
+ ### Response:"""
186
+
187
+
188
+
189
+
190
+ INSTRUCTION_PROMPT=PromptTemplate(template=instruction_prompt_template_orca, input_variables=['question', 'summaries'])
191
+
192
+ return INSTRUCTION_PROMPT, CONTENT_PROMPT
193
+
194
+ def adapt_q_from_chat_history(question, chat_history, extracted_memory, keyword_model=""):#keyword_model): # new_question_keywords,
195
+
196
+ chat_history_str, chat_history_first_q, chat_history_first_ans, max_chat_length = _get_chat_history(chat_history)
197
+
198
+ if chat_history_str:
199
+ # Keyword extraction is now done in the add_inputs_to_history function
200
+ extracted_memory = extracted_memory#remove_q_stopwords(str(chat_history_first_q) + " " + str(chat_history_first_ans))
201
+
202
+
203
+ new_question_kworded = str(extracted_memory) + ". " + question #+ " " + new_question_keywords
204
+ #extracted_memory + " " + question
205
+
206
+ else:
207
+ new_question_kworded = question #new_question_keywords
208
+
209
+ #print("Question output is: " + new_question_kworded)
210
+
211
+ return new_question_kworded
212
+
213
+ def create_doc_df(docs_keep_out):
214
+ # Extract content and metadata from 'winning' passages.
215
+ content=[]
216
+ meta=[]
217
+ meta_url=[]
218
+ page_section=[]
219
+ score=[]
220
+
221
+ for item in docs_keep_out:
222
+ content.append(item[0].page_content)
223
+ meta.append(item[0].metadata)
224
+ meta_url.append(item[0].metadata['source'])
225
+ page_section.append(item[0].metadata['page_section'])
226
+ score.append(item[1])
227
+
228
+ # Create df from 'winning' passages
229
+
230
+ doc_df = pd.DataFrame(list(zip(content, meta, page_section, meta_url, score)),
231
+ columns =['page_content', 'metadata', 'page_section', 'meta_url', 'score'])
232
+
233
+ docs_content = doc_df['page_content'].astype(str)
234
+ doc_df['full_url'] = "https://" + doc_df['meta_url']
235
+
236
+ return doc_df
237
+
238
+ def hybrid_retrieval(new_question_kworded, k_val, out_passages,
239
+ vec_score_cut_off, vec_weight, bm25_weight, svm_weight): # ,vectorstore, embeddings
240
+
241
+ vectorstore=globals()["vectorstore"]
242
+ embeddings=globals()["embeddings"]
243
+
244
+
245
+ docs = vectorstore.similarity_search_with_score(new_question_kworded, k=k_val)
246
+
247
+ print("Docs from similarity search:")
248
+ print(docs)
249
+
250
+ # Keep only documents with a certain score
251
+ docs_len = [len(x[0].page_content) for x in docs]
252
+ docs_scores = [x[1] for x in docs]
253
+
254
+ # Only keep sources that are sufficiently relevant (i.e. similarity search score below the vec_score_cut_off threshold)
255
+ score_more_limit = pd.Series(docs_scores) < vec_score_cut_off
256
+ docs_keep = list(compress(docs, score_more_limit))
257
+
258
+ if docs_keep == []:
259
+ docs_keep_as_doc = []
260
+ docs_content = []
261
+ docs_url = []
262
+ return docs_keep_as_doc, docs_content, docs_url
263
+
264
+ # Only keep sources that are at least 100 characters long
265
+ length_more_limit = pd.Series(docs_len) >= 100
266
+ docs_keep = list(compress(docs_keep, length_more_limit))
267
+
268
+ if docs_keep == []:
269
+ docs_keep_as_doc = []
270
+ docs_content = []
271
+ docs_url = []
272
+ return docs_keep_as_doc, docs_content, docs_url
273
+
274
+ docs_keep_as_doc = [x[0] for x in docs_keep]
275
+ docs_keep_length = len(docs_keep_as_doc)
276
+
277
+
278
+
279
+ if docs_keep_length == 1:
280
+
281
+ content=[]
282
+ meta_url=[]
283
+ score=[]
284
+
285
+ for item in docs_keep:
286
+ content.append(item[0].page_content)
287
+ meta_url.append(item[0].metadata['source'])
288
+ score.append(item[1])
289
+
290
+ # Create df from 'winning' passages
291
+
292
+ doc_df = pd.DataFrame(list(zip(content, meta_url, score)),
293
+ columns =['page_content', 'meta_url', 'score'])
294
+
295
+ docs_content = doc_df['page_content'].astype(str)
296
+ docs_url = doc_df['meta_url']
297
+
298
+ return docs_keep_as_doc, docs_content, docs_url
299
+
300
+ # Check for if more docs are removed than the desired output
301
+ if out_passages > docs_keep_length:
302
+ out_passages = docs_keep_length
303
+ k_val = docs_keep_length
304
+
305
+ vec_rank = [*range(1, docs_keep_length+1)]
306
+ vec_score = [(docs_keep_length/x)*vec_weight for x in vec_rank]
307
+
308
+ # 2nd level check on retrieved docs with BM25
309
+
310
+ content_keep=[]
311
+ for item in docs_keep:
312
+ content_keep.append(item[0].page_content)
313
+
314
+ corpus = [doc.lower().split() for doc in content_keep]
315
+ dictionary = Dictionary(corpus)
316
+ bm25_model = OkapiBM25Model(dictionary=dictionary)
317
+ bm25_corpus = bm25_model[list(map(dictionary.doc2bow, corpus))]
318
+ bm25_index = SparseMatrixSimilarity(bm25_corpus, num_docs=len(corpus), num_terms=len(dictionary),
319
+ normalize_queries=False, normalize_documents=False)
320
+ query = new_question_kworded.lower().split()
321
+ tfidf_model = TfidfModel(dictionary=dictionary, smartirs='bnn') # Enforce binary weighting of queries
322
+ tfidf_query = tfidf_model[dictionary.doc2bow(query)]
323
+ similarities = np.array(bm25_index[tfidf_query])
324
+ #print(similarities)
325
+ temp = similarities.argsort()
326
+ ranks = np.arange(len(similarities))[temp.argsort()][::-1]
327
+
328
+ # Pair each index with its corresponding value
329
+ pairs = list(zip(ranks, docs_keep_as_doc))
330
+ # Sort the pairs by the indices
331
+ pairs.sort()
332
+ # Extract the values in the new order
333
+ bm25_result = [value for ranks, value in pairs]
334
+
335
+ bm25_rank=[]
336
+ bm25_score = []
337
+
338
+ for vec_item in docs_keep:
339
+ x = 0
340
+ for bm25_item in bm25_result:
341
+ x = x + 1
342
+ if bm25_item.page_content == vec_item[0].page_content:
343
+ bm25_rank.append(x)
344
+ bm25_score.append((docs_keep_length/x)*bm25_weight)
345
+
346
+ # 3rd level check on retrieved docs with SVM retriever
347
+ svm_retriever = SVMRetriever.from_texts(content_keep, embeddings, k = k_val)
348
+ svm_result = svm_retriever.get_relevant_documents(new_question_kworded)
349
+
350
+
351
+ svm_rank=[]
352
+ svm_score = []
353
+
354
+ for vec_item in docs_keep:
355
+ x = 0
356
+ for svm_item in svm_result:
357
+ x = x + 1
358
+ if svm_item.page_content == vec_item[0].page_content:
359
+ svm_rank.append(x)
360
+ svm_score.append((docs_keep_length/x)*svm_weight)
361
+
362
+
363
+ ## Calculate final score based on three ranking methods
364
+ final_score = [a + b + c for a, b, c in zip(vec_score, bm25_score, svm_score)]
365
+ final_rank = [sorted(final_score, reverse=True).index(x)+1 for x in final_score]
366
+ # Force final_rank to increment by 1 each time
367
+ final_rank = list(pd.Series(final_rank).rank(method='first'))
368
+
369
+ #print("final rank: " + str(final_rank))
370
+ #print("out_passages: " + str(out_passages))
371
+
372
+ best_rank_index_pos = []
373
+
374
+ for x in range(1,out_passages+1):
375
+ try:
376
+ best_rank_index_pos.append(final_rank.index(x))
377
+ except IndexError: # catch the error
378
+ pass
379
+
380
+ # Adjust best_rank_index_pos to
381
+
382
+ best_rank_pos_series = pd.Series(best_rank_index_pos)
383
+
384
+
385
+ docs_keep_out = [docs_keep[i] for i in best_rank_index_pos]
386
+
387
+ # Keep only 'best' options
388
+ docs_keep_as_doc = [x[0] for x in docs_keep_out]
389
+
390
+ # Make df of best options
391
+ doc_df = create_doc_df(docs_keep_out)
392
+
393
+ return docs_keep_as_doc, doc_df, docs_keep_out
394
+
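+ # A minimal, self-contained sketch of the rank-fusion scoring used in hybrid_retrieval above:
+ # each retriever contributes (number_of_docs / rank) * weight per document, and the three
+ # contributions are summed before re-ranking. The ranks and weights below are illustrative
+ # values, not outputs of the real vector / BM25 / SVM retrievers.
+ def fuse_ranks(vec_rank, bm25_rank, svm_rank, vec_weight=1, bm25_weight=1, svm_weight=1):
+     n = len(vec_rank)
+     vec_score = [(n / r) * vec_weight for r in vec_rank]
+     bm25_score = [(n / r) * bm25_weight for r in bm25_rank]
+     svm_score = [(n / r) * svm_weight for r in svm_rank]
+     final_score = [a + b + c for a, b, c in zip(vec_score, bm25_score, svm_score)]
+     # Rank 1 = best document overall
+     return [sorted(final_score, reverse=True).index(x) + 1 for x in final_score]
+ 
+ # e.g. fuse_ranks([1, 2, 3], [2, 1, 3], [1, 3, 2]) -> [1, 2, 3]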
395
+ def get_expanded_passages(vectorstore, docs_keep_out, width):
396
+ """
397
+ Extracts expanded passages based on given documents and a width for context.
398
+
399
+ Parameters:
400
+ - vectorstore: The primary data source.
401
+ - docs_keep_out: List of documents to be expanded.
402
+ - width: Number of documents to expand around a given document for context.
403
+
404
+ Returns:
405
+ - expanded_docs: List of expanded Document objects.
406
+ - doc_df: DataFrame representation of expanded_docs.
407
+ """
408
+
409
+ def get_docs_from_vstore(vectorstore):
410
+ vector = vectorstore.docstore._dict
411
+ return list(vector.items())
412
+
413
+ def extract_details(docs_list):
414
+ docs_list_out = [tup[1] for tup in docs_list]
415
+ content = [doc.page_content for doc in docs_list_out]
416
+ meta = [doc.metadata for doc in docs_list_out]
417
+ return ''.join(content), meta[0], meta[-1]
418
+
419
+ def get_parent_content_and_meta(vstore_docs, width, target):
420
+ target_range = range(max(0, target - width), min(len(vstore_docs), target + width + 1))
421
+ parent_vstore_out = [vstore_docs[i] for i in target_range]
422
+
423
+ content_str_out, meta_first_out, meta_last_out = [], [], []
424
+ # extract_details summarises the whole window, so a single call is enough
425
+ content_str, meta_first, meta_last = extract_details(parent_vstore_out)
426
+ content_str_out.append(content_str)
427
+ meta_first_out.append(meta_first)
428
+ meta_last_out.append(meta_last)
429
+ return content_str_out, meta_first_out, meta_last_out
430
+
431
+ def merge_dicts_except_source(d1, d2):
432
+ merged = {}
433
+ for key in d1:
434
+ if key != "source":
435
+ merged[key] = str(d1[key]) + " to " + str(d2[key])
436
+ else:
437
+ merged[key] = d1[key] # or d2[key], based on preference
438
+ return merged
439
+
440
+ def merge_two_lists_of_dicts(list1, list2):
441
+ return [merge_dicts_except_source(d1, d2) for d1, d2 in zip(list1, list2)]
442
+
443
+ vstore_docs = get_docs_from_vstore(vectorstore)
444
+ parent_vstore_meta_section = [doc.metadata['page_section'] for _, doc in vstore_docs]
445
+
446
+ expanded_docs = []
447
+ for doc, score in docs_keep_out:
448
+ search_section = doc.metadata['page_section']
449
+ search_index = parent_vstore_meta_section.index(search_section) if search_section in parent_vstore_meta_section else -1
450
+
451
+ content_str, meta_first, meta_last = get_parent_content_and_meta(vstore_docs, width, search_index)
452
+ meta_full = merge_two_lists_of_dicts(meta_first, meta_last)
453
+
454
+ print(meta_full)
455
+
456
+ expanded_doc = (Document(page_content=content_str[0], metadata=meta_full[0]), score)
457
+ expanded_docs.append(expanded_doc)
458
+
459
+ doc_df = create_doc_df(expanded_docs) # Assuming you've defined the 'create_doc_df' function elsewhere
460
+
461
+ return expanded_docs, doc_df
462
+
463
+ def create_final_prompt(inputs: Dict[str, str], instruction_prompt, content_prompt, extracted_memory): # ,
464
+
465
+ question = inputs["question"]
466
+ chat_history = inputs["chat_history"]
467
+
468
+
469
+ new_question_kworded = adapt_q_from_chat_history(question, chat_history, extracted_memory) # new_question_keywords,
470
+
471
+
472
+ #print("The question passed to the vector search is:")
473
+ #print(new_question_kworded)
474
+
475
+ #docs_keep_as_doc, docs_content, docs_url = find_relevant_passages(new_question_kworded, k_val = 5, out_passages = 3,
476
+ # vec_score_cut_off = 1.3, vec_weight = 1, tfidf_weight = 0.5, svm_weight = 1)
477
+
478
+ docs_keep_as_doc, doc_df, docs_keep_out = hybrid_retrieval(new_question_kworded, k_val = 5, out_passages = 2,
479
+ vec_score_cut_off = 1, vec_weight = 1, bm25_weight = 1, svm_weight = 1)#,
480
+ #vectorstore=globals()["vectorstore"], embeddings=globals()["embeddings"])
481
+
482
+ # Expand the found passages to the neighbouring context
483
+ docs_keep_as_doc, doc_df = get_expanded_passages(vectorstore, docs_keep_out, width=1)
484
+
485
+ if docs_keep_as_doc == []:
486
+ {"answer": "I'm sorry, I couldn't find a relevant answer to this question.", "sources":"I'm sorry, I couldn't find a relevant source for this question."}
487
+
488
+ #new_inputs = inputs.copy()
489
+ #new_inputs["question"] = new_question
490
+ #new_inputs["chat_history"] = chat_history_str
491
+
492
+ #print(docs_url)
493
+ #print(doc_df['metadata'])
494
+
495
+ # Build up sources content to add to user display
496
+
497
+ doc_df['meta_clean'] = [f"<b>{' '.join(f'{k}: {v}' for k, v in d.items() if k != 'page_section')}</b>" for d in doc_df['metadata']]
498
+ doc_df['content_meta'] = doc_df['meta_clean'].astype(str) + ".<br><br>" + doc_df['page_content'].astype(str)
499
+
500
+ modified_page_content = [f" SOURCE {i+1} - {word}" for i, word in enumerate(doc_df['page_content'])]
501
+ docs_content_string = ''.join(modified_page_content)
502
+
503
+ #docs_content_string = '<br><br>\n\n SOURCE '.join(doc_df['page_content'])#.replace(" "," ")#.strip()
504
+ sources_docs_content_string = '<br><br>'.join(doc_df['content_meta'])#.replace(" "," ")#.strip()
505
+ #sources_docs_content_tup = [(sources_docs_content,None)]
506
+ #print("The draft instruction prompt is:")
507
+ #print(instruction_prompt)
508
+
509
+ instruction_prompt_out = instruction_prompt.format(question=new_question_kworded, summaries=docs_content_string)
510
+ #print("The final instruction prompt:")
511
+ #print(instruction_prompt_out)
512
+
513
+
514
+ return instruction_prompt_out, sources_docs_content_string, new_question_kworded
515
+
516
+ def get_history_sources_final_input_prompt(user_input, history, extracted_memory):#):
517
+
518
+ #if chain_agent is None:
519
+ # history.append((user_input, "Please click the button to submit the Huggingface API key before using the chatbot (top right)"))
520
+ # return history, history, "", ""
521
+ print("\n==== date/time: " + str(datetime.datetime.now()) + " ====")
522
+ print("User input: " + user_input)
523
+
524
+ history = history or []
525
+
526
+
527
+
528
+ # Create instruction prompt
529
+ instruction_prompt, content_prompt = create_prompt_templates()
530
+ instruction_prompt_out, docs_content_string, new_question_kworded =\
531
+ create_final_prompt({"question": user_input, "chat_history": history}, #vectorstore,
532
+ instruction_prompt, content_prompt, extracted_memory)
533
+
534
+
535
+ history.append(user_input)
536
+
537
+ print("Output history is:")
538
+ print(history)
539
+
540
+ #print("The output prompt is:")
541
+ #print(instruction_prompt_out)
542
+
543
+ return history, docs_content_string, instruction_prompt_out
544
+
545
+ def highlight_found_text_single(search_text:str, full_text:str, hlt_chunk_size:int=hlt_chunk_size, hlt_strat:List=hlt_strat, hlt_overlap:int=hlt_overlap) -> str:
546
+ """
547
+ Highlights occurrences of search_text within full_text.
548
+
549
+ Parameters:
550
+ - search_text (str): The text to be searched for within full_text.
551
+ - full_text (str): The text within which search_text occurrences will be highlighted.
552
+
553
+ Returns:
554
+ - str: A string with occurrences of search_text highlighted.
555
+
556
+ Example:
557
+ >>> highlight_found_text("world", "Hello, world! This is a test. Another world awaits.")
558
+ 'Hello, <mark style="color:black;">world</mark>! This is a test. Another world awaits.'
559
+ """
560
+
561
+ def extract_text_from_input(text,i=0):
562
+ if isinstance(text, str):
563
+ return text.replace(" ", " ").strip()#.replace("\r", " ").replace("\n", " ")
564
+ elif isinstance(text, list):
565
+ return text[i][0].replace(" ", " ").strip()#.replace("\r", " ").replace("\n", " ")
566
+ else:
567
+ return ""
568
+
569
+ def extract_search_text_from_input(text):
570
+ if isinstance(text, str):
571
+ return text.replace(" ", " ").strip()#.replace("\r", " ").replace("\n", " ").replace(" ", " ").strip()
572
+ elif isinstance(text, list):
573
+ return text[-1][1].replace(" ", " ").strip()#.replace("\r", " ").replace("\n", " ").replace(" ", " ").strip()
574
+ else:
575
+ return ""
576
+
577
+ full_text = extract_text_from_input(full_text)
578
+ search_text = extract_search_text_from_input(search_text)
579
+
580
+ text_splitter = RecursiveCharacterTextSplitter(
581
+ chunk_size=hlt_chunk_size,
582
+ separators=hlt_strat,
583
+ chunk_overlap=hlt_overlap,
584
+ )
585
+ sections = text_splitter.split_text(search_text)
586
+
587
+ #print(sections)
588
+
589
+ found_positions = {}
590
+ for x in sections:
591
+ text_start_pos = full_text.find(x)
592
+
593
+ if text_start_pos != -1:
594
+ found_positions[text_start_pos] = text_start_pos + len(x)
595
+
596
+ # Combine overlapping or adjacent positions
597
+ sorted_starts = sorted(found_positions.keys())
598
+ combined_positions = []
599
+ if sorted_starts:
600
+ current_start, current_end = sorted_starts[0], found_positions[sorted_starts[0]]
601
+ for start in sorted_starts[1:]:
602
+ if start <= (current_end + 1):
603
+ current_end = max(current_end, found_positions[start])
604
+ else:
605
+ combined_positions.append((current_start, current_end))
606
+ current_start, current_end = start, found_positions[start]
607
+ combined_positions.append((current_start, current_end))
608
+
609
+ # Construct pos_tokens
610
+ pos_tokens = []
611
+ prev_end = 0
612
+ for start, end in combined_positions:
613
+ pos_tokens.append(full_text[prev_end:start]) # ((full_text[prev_end:start], None))
614
+ pos_tokens.append('<mark style="color:black;">' + full_text[start:end] + '</mark>')# ("<mark>" + full_text[start:end] + "</mark>",'found')
615
+ prev_end = end
616
+ pos_tokens.append(full_text[prev_end:])
617
+
618
+ return "".join(pos_tokens)
619
+
620
+ def highlight_found_text(search_text: str, full_text: str, hlt_chunk_size:int=hlt_chunk_size, hlt_strat:List=hlt_strat, hlt_overlap:int=hlt_overlap) -> str:
621
+ """
622
+ Highlights occurrences of search_text within full_text.
623
+
624
+ Parameters:
625
+ - search_text (str): The text to be searched for within full_text.
626
+ - full_text (str): The text within which search_text occurrences will be highlighted.
627
+
628
+ Returns:
629
+ - str: A string with occurrences of search_text highlighted.
630
+
631
+ Example:
632
+ >>> highlight_found_text("world", "Hello, world! This is a test. Another world awaits.")
633
+ 'Hello, <mark style="color:black;">world</mark>! This is a test. Another <mark style="color:black;">world</mark> awaits.'
634
+ """
635
+
636
+ def extract_text_from_input(text, i=0):
637
+ if isinstance(text, str):
638
+ return text.replace(" ", " ").strip()
639
+ elif isinstance(text, list):
640
+ return text[i][0].replace(" ", " ").strip()
641
+ else:
642
+ return ""
643
+
644
+ def extract_search_text_from_input(text):
645
+ if isinstance(text, str):
646
+ return text.replace(" ", " ").strip()
647
+ elif isinstance(text, list):
648
+ return text[-1][1].replace(" ", " ").strip()
649
+ else:
650
+ return ""
651
+
652
+ full_text = extract_text_from_input(full_text)
653
+ search_text = extract_search_text_from_input(search_text)
654
+
655
+
656
+
657
+ text_splitter = RecursiveCharacterTextSplitter(
658
+ chunk_size=hlt_chunk_size,
659
+ separators=hlt_strat,
660
+ chunk_overlap=hlt_overlap,
661
+ )
662
+ sections = text_splitter.split_text(search_text)
663
+
664
+ found_positions = {}
665
+ for x in sections:
666
+ text_start_pos = 0
667
+ while text_start_pos != -1:
668
+ text_start_pos = full_text.find(x, text_start_pos)
669
+ if text_start_pos != -1:
670
+ found_positions[text_start_pos] = text_start_pos + len(x)
671
+ text_start_pos += 1
672
+
673
+ # Combine overlapping or adjacent positions
674
+ sorted_starts = sorted(found_positions.keys())
675
+ combined_positions = []
676
+ if sorted_starts:
677
+ current_start, current_end = sorted_starts[0], found_positions[sorted_starts[0]]
678
+ for start in sorted_starts[1:]:
679
+ if start <= (current_end + 1):
680
+ current_end = max(current_end, found_positions[start])
681
+ else:
682
+ combined_positions.append((current_start, current_end))
683
+ current_start, current_end = start, found_positions[start]
684
+ combined_positions.append((current_start, current_end))
685
+
686
+ # Construct pos_tokens
687
+ pos_tokens = []
688
+ prev_end = 0
689
+ for start, end in combined_positions:
690
+ pos_tokens.append(full_text[prev_end:start])
691
+ pos_tokens.append('<mark style="color:black;">' + full_text[start:end] + '</mark>')
692
+ prev_end = end
693
+ pos_tokens.append(full_text[prev_end:])
694
+
695
+ return "".join(pos_tokens)
696
+
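+ # A small standalone sketch of the position-merging step inside highlight_found_text:
+ # overlapping or adjacent (start, end) matches are collapsed into single highlight spans.
+ # The example dictionary of positions is made up for illustration.
+ def merge_found_positions(found_positions: dict) -> list:
+     starts = sorted(found_positions)
+     merged = []
+     if not starts:
+         return merged
+     current_start, current_end = starts[0], found_positions[starts[0]]
+     for start in starts[1:]:
+         if start <= current_end + 1:
+             current_end = max(current_end, found_positions[start])
+         else:
+             merged.append((current_start, current_end))
+             current_start, current_end = start, found_positions[start]
+     merged.append((current_start, current_end))
+     return merged
+ 
+ # e.g. merge_found_positions({0: 5, 4: 10, 20: 25}) -> [(0, 10), (20, 25)]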
697
+ # # Chat functions
698
+ def produce_streaming_answer_chatbot_gpt4all(history, full_prompt):
699
+
700
+ print("The question is: ")
701
+ print(full_prompt)
702
+
703
+ # Pull the generated text from the streamer, and update the model output.
704
+ history[-1][1] = ""
705
+ for new_text in gpt4all_model.generate(full_prompt, max_tokens=2000, streaming=True):
706
+ if new_text is None: new_text = ""
707
+ history[-1][1] += new_text
708
+ yield history
709
+
710
+ def produce_streaming_answer_chatbot_hf(history, full_prompt):
711
+
712
+ #print("The question is: ")
713
+ #print(full_prompt)
714
+
715
+ # Get the model and tokenizer, and tokenize the user text.
716
+ model_inputs = tokenizer(text=full_prompt, return_tensors="pt").to(torch_device)
717
+
718
+ # Start generation on a separate thread, so that we don't block the UI. The text is pulled from the streamer
719
+ # in the main thread. Adds timeout to the streamer to handle exceptions in the generation thread.
720
+ streamer = TextIteratorStreamer(tokenizer, timeout=60., skip_prompt=True, skip_special_tokens=True)
721
+ generate_kwargs = dict(
722
+ model_inputs,
723
+ streamer=streamer,
724
+ max_new_tokens=max_new_tokens,
725
+ do_sample=sample,
726
+ repetition_penalty=1.3,
727
+ top_p=top_p,
728
+ temperature=temperature,
729
+ top_k=top_k
730
+ )
731
+ t = Thread(target=model.generate, kwargs=generate_kwargs)
732
+ t.start()
733
+
734
+ # Pull the generated text from the streamer, and update the model output.
735
+ import time
736
+ start = time.time()
737
+ NUM_TOKENS=0
738
+ print('-'*4+'Start Generation'+'-'*4)
739
+
740
+ history[-1][1] = ""
741
+ for new_text in streamer:
742
+ if new_text is None: new_text = ""
743
+ history[-1][1] += new_text
744
+ NUM_TOKENS+=1
745
+ yield history
746
+
747
+ time_generate = time.time() - start
748
+ print('\n')
749
+ print('-'*4+'End Generation'+'-'*4)
750
+ print(f'Num of generated tokens: {NUM_TOKENS}')
751
+ print(f'Time for complete generation: {time_generate}s')
752
+ print(f'Tokens per second: {NUM_TOKENS/time_generate}')
753
+ print(f'Time per token: {(time_generate/NUM_TOKENS)*1000}ms')
754
+
755
+ def produce_streaming_answer_chatbot_ctrans(history, full_prompt):
756
+
757
+ print("The question is: ")
758
+ print(full_prompt)
759
+
760
+ #tokens = ctrans_llm.tokenize(full_prompt)
761
+
762
+ #import psutil
763
+ #from loguru import logger
764
+
765
+ #_ = [elm for elm in full_prompt.splitlines() if elm.strip()]
766
+ #stop_string = [elm.split(":")[0] + ":" for elm in _][-2]
767
+ #print(stop_string)
768
+
769
+ #logger.debug(f"{stop_string=} not used")
770
+
771
+ #_ = psutil.cpu_count(logical=False) - 1
772
+ #cpu_count: int = int(_) if _ else 1
773
+ #logger.debug(f"{cpu_count=}")
774
+
775
+ # Pull the generated text from the streamer, and update the model output.
776
+ config = GenerationConfig(reset=True)
777
+ history[-1][1] = ""
778
+ for new_text in ctrans_generate(prompt=full_prompt, config=config):
779
+ if new_text is None: new_text = ""
780
+ history[-1][1] += new_text
781
+ yield history
782
+
783
+ @dataclass
784
+ class GenerationConfig:
785
+ temperature: float = temperature
786
+ top_k: int = top_k
787
+ top_p: float = top_p
788
+ repetition_penalty: float = repetition_penalty
789
+ last_n_tokens: int = last_n_tokens
790
+ max_new_tokens: int = max_new_tokens
791
+ #seed: int = 42
792
+ reset: bool = reset
793
+ stream: bool = stream
794
+ threads: int = threads
795
+ batch_size:int = batch_size
796
+ #context_length:int = context_length
797
+ #gpu_layers:int = gpu_layers
798
+ #stop: list[str] = field(default_factory=lambda: [stop_string])
799
+
800
+ def ctrans_generate(
801
+ prompt: str,
802
+ llm=ctrans_llm,
803
+ config: GenerationConfig = GenerationConfig(),
804
+ ):
805
+ """Run model inference, will return a Generator if streaming is true."""
806
+
807
+ return llm(
808
+ prompt,
809
+ **asdict(config),
810
+ )
811
+
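+ # A minimal sketch of the pattern used by GenerationConfig and ctrans_generate above:
+ # a dataclass holds the sampling settings and asdict() unpacks them as keyword arguments
+ # for the model call. DemoConfig and fake_llm are stand-ins, not the real ctransformers model.
+ from dataclasses import dataclass, asdict
+ 
+ @dataclass
+ class DemoConfig:
+     temperature: float = 0.7
+     top_k: int = 40
+     max_new_tokens: int = 256
+ 
+ def fake_llm(prompt, **kwargs):
+     return f"prompt={prompt!r}, settings={kwargs}"
+ 
+ # e.g. fake_llm("Hello", **asdict(DemoConfig()))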
812
+ def turn_off_interactivity(user_message, history):
813
+ return gr.update(value="", interactive=False), history + [[user_message, None]]
814
+
815
+ # # Chat history functions
816
+
817
+ def clear_chat(chat_history_state, sources, chat_message, current_topic):
818
+ chat_history_state = []
819
+ sources = ''
820
+ chat_message = ''
821
+ current_topic = ''
822
+
823
+ return chat_history_state, sources, chat_message, current_topic
824
+
825
+ def _get_chat_history(chat_history: List[Tuple[str, str]], max_chat_length:int = 20): # Limit to last x interactions only
826
+
827
+ if not chat_history:
828
+ chat_history = []
829
+
830
+ if len(chat_history) > max_chat_length:
831
+ chat_history = chat_history[-max_chat_length:]
832
+
833
+ #print(chat_history)
834
+
835
+ first_q = ""
836
+ first_ans = ""
837
+ for human_s, ai_s in chat_history:
838
+ first_q = human_s
839
+ first_ans = ai_s
840
+
841
+ #print("Text to keyword extract: " + first_q + " " + first_ans)
842
+ break
843
+
844
+ conversation = ""
845
+ for human_s, ai_s in chat_history:
846
+ human = "Human: " + human_s
847
+ ai = "Assistant: " + ai_s
848
+ conversation += "\n" + "\n".join([human, ai])
849
+
850
+ return conversation, first_q, first_ans, max_chat_length
851
+
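+ # An illustrative call of _get_chat_history above; the history pair is made up.
+ def demo_get_chat_history():
+     # Returns ("\nHuman: What is in the plan?\nAssistant: It covers housing and jobs.",
+     #          "What is in the plan?", "It covers housing and jobs.", 20)
+     return _get_chat_history([("What is in the plan?", "It covers housing and jobs.")])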
852
+ def add_inputs_answer_to_history(user_message, history, current_topic):
853
+
854
+ #history.append((user_message, [-1]))
855
+
856
+ chat_history_str, chat_history_first_q, chat_history_first_ans, max_chat_length = _get_chat_history(history)
857
+
858
+
859
+ # Only get the keywords for the first question and response, or do it every time if over 'max_chat_length' responses in the conversation
860
+ if (len(history) == 1) | (len(history) > max_chat_length):
861
+
862
+ #print("History after appending is:")
863
+ #print(history)
864
+
865
+ first_q_and_first_ans = str(chat_history_first_q) + " " + str(chat_history_first_ans)
866
+ #ner_memory = remove_q_ner_extractor(first_q_and_first_ans)
867
+ keywords = keybert_keywords(first_q_and_first_ans, n = 8, kw_model=kw_model)
868
+ #keywords.append(ner_memory)
869
+
870
+ # Remove duplicate words while preserving order
871
+ ordered_tokens = set()
872
+ result = []
873
+ for word in keywords:
874
+ if word not in ordered_tokens:
875
+ ordered_tokens.add(word)
876
+ result.append(word)
877
+
878
+ extracted_memory = ' '.join(result)
879
+
880
+ else: extracted_memory=current_topic
881
+
882
+ print("Extracted memory is:")
883
+ print(extracted_memory)
884
+
885
+
886
+ return history, extracted_memory
887
+
888
+ def remove_q_stopwords(question): # Remove stopwords from question. Not used at the moment
889
+ # Prepare keywords from question by removing stopwords
890
+ text = question.lower()
891
+
892
+ # Remove numbers
893
+ text = re.sub('[0-9]', '', text)
894
+
895
+ tokenizer = RegexpTokenizer(r'\w+')
896
+ text_tokens = tokenizer.tokenize(text)
897
+ #text_tokens = word_tokenize(text)
898
+ tokens_without_sw = [word for word in text_tokens if not word in stopwords]
899
+
900
+ # Remove duplicate words while preserving order
901
+ ordered_tokens = set()
902
+ result = []
903
+ for word in tokens_without_sw:
904
+ if word not in ordered_tokens:
905
+ ordered_tokens.add(word)
906
+ result.append(word)
907
+
908
+
909
+
910
+ new_question_keywords = ' '.join(result)
911
+ return new_question_keywords
912
+
913
+ def remove_q_ner_extractor(question):
914
+
915
+ predict_out = ner_model.predict(question)
916
+
917
+
918
+
919
+ predict_tokens = [' '.join(v for k, v in d.items() if k == 'span') for d in predict_out]
920
+
921
+ # Remove duplicate words while preserving order
922
+ ordered_tokens = set()
923
+ result = []
924
+ for word in predict_tokens:
925
+ if word not in ordered_tokens:
926
+ ordered_tokens.add(word)
927
+ result.append(word)
928
+
929
+
930
+
931
+ new_question_keywords = ' '.join(result).lower()
932
+ return new_question_keywords
933
+
934
+ def apply_lemmatize(text, wnl=WordNetLemmatizer()):
935
+
936
+ def prep_for_lemma(text):
937
+
938
+ # Remove numbers
939
+ text = re.sub('[0-9]', '', text)
940
+ print(text)
941
+
942
+ tokenizer = RegexpTokenizer(r'\w+')
943
+ text_tokens = tokenizer.tokenize(text)
944
+ #text_tokens = word_tokenize(text)
945
+
946
+ return text_tokens
947
+
948
+ tokens = prep_for_lemma(text)
949
+
950
+ def lem_word(word):
951
+
952
+ if len(word) > 3: out_word = wnl.lemmatize(word)
953
+ else: out_word = word
954
+
955
+ return out_word
956
+
957
+ return [lem_word(token) for token in tokens]
958
+
959
+ def keybert_keywords(text, n, kw_model):
960
+ tokens_lemma = apply_lemmatize(text)
961
+ lemmatised_text = ' '.join(tokens_lemma)
962
+
963
+ keywords_text = keybert.KeyBERT(model=kw_model).extract_keywords(lemmatised_text, stop_words='english', top_n=n,
964
+ keyphrase_ngram_range=(1, 1))
965
+ keywords_list = [item[0] for item in keywords_text]
966
+
967
+ return keywords_list
968
+
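+ # A compact, order-preserving de-duplication equivalent to the set-plus-list pattern used
+ # several times above (dict keys keep insertion order in Python 3.7+); the word list is illustrative.
+ def dedupe_preserve_order(words):
+     return list(dict.fromkeys(words))
+ 
+ # e.g. dedupe_preserve_order(["housing", "jobs", "housing"]) -> ["housing", "jobs"]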
chatfuncs/ingest.py ADDED
@@ -0,0 +1,522 @@
1
+ # ---
2
+ # jupyter:
3
+ # jupytext:
4
+ # formats: ipynb,py:light
5
+ # text_representation:
6
+ # extension: .py
7
+ # format_name: light
8
+ # format_version: '1.5'
9
+ # jupytext_version: 1.14.6
10
+ # kernelspec:
11
+ # display_name: Python 3 (ipykernel)
12
+ # language: python
13
+ # name: python3
14
+ # ---
15
+
16
+ # # Ingest website to FAISS
17
+
18
+ # ## Install/ import stuff we need
19
+
20
+ import os
21
+ from pathlib import Path
22
+ import re
23
+ import requests
24
+ import pandas as pd
25
+ import dateutil.parser
26
+ from typing import TypeVar, List
27
+
28
+ from langchain.embeddings import HuggingFaceInstructEmbeddings, HuggingFaceEmbeddings
29
+ from langchain.vectorstores.faiss import FAISS
30
+ from langchain.vectorstores import Chroma
31
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
32
+ from langchain.docstore.document import Document
33
+
34
+ from bs4 import BeautifulSoup
35
+ from docx import Document as Doc
36
+ from pypdf import PdfReader
37
+
38
+ PandasDataFrame = TypeVar('pd.core.frame.DataFrame')
39
+ # -
40
+
41
+ split_strat = ["\n\n", "\n", ".", "!", "?", ","]
42
+ chunk_size = 500
43
+ chunk_overlap = 0
44
+ start_index = True
45
+
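+ # A short usage sketch of the splitter settings above, mirroring how they are applied further
+ # down in pdf_text_to_docs and html_text_to_docs; sample_text is an illustrative argument.
+ def demo_split(sample_text: str) -> List[str]:
+     text_splitter = RecursiveCharacterTextSplitter(
+         chunk_size=chunk_size,
+         separators=split_strat,
+         chunk_overlap=chunk_overlap,
+         add_start_index=start_index,
+     )
+     return text_splitter.split_text(sample_text)  # list of roughly chunk_size-character chunks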
46
+ ## Parse files
47
+
48
+ def parse_file(file_paths, div:str = "p"):
49
+ """
50
+ Accepts a list of file paths, determines each file's type based on its extension,
51
+ and passes it to the relevant parsing function.
52
+
53
+ Parameters:
54
+ file_paths (list): List of file paths.
55
+ div (str): (optional) Div to pull out of html file/url with BeautifulSoup
56
+
57
+ Returns:
58
+ dict: A dictionary with file paths as keys and their parsed content (or error message) as values.
59
+ """
60
+
61
+ def determine_file_type(file_path):
62
+ """
63
+ Determine the file type based on its extension.
64
+
65
+ Parameters:
66
+ file_path (str): Path to the file.
67
+
68
+ Returns:
69
+ str: File extension (e.g., '.pdf', '.docx', '.txt', '.html').
70
+ """
71
+ return os.path.splitext(file_path)[1].lower()
72
+
73
+ if not isinstance(file_paths, list):
74
+ raise ValueError("Expected a list of file paths.")
75
+
76
+ extension_to_parser = {
77
+ '.pdf': parse_pdf,
78
+ '.docx': parse_docx,
79
+ '.txt': parse_txt,
80
+ '.html': parse_html,
81
+ '.htm': parse_html # Considering both .html and .htm for HTML files
82
+ }
83
+
84
+ parsed_contents = {}
85
+
86
+ for file_path in file_paths:
87
+ print(file_path.name)
88
+ #file = open(file_path.name, 'r')
89
+ #print(file)
90
+ file_extension = determine_file_type(file_path.name)
91
+ if file_extension in extension_to_parser:
92
+ parsed_contents[file_path.name] = extension_to_parser[file_extension](file_path.name)
93
+ else:
94
+ parsed_contents[file_path.name] = f"Unsupported file type: {file_extension}"
95
+
96
+ return parsed_contents
97
+
98
+ def text_regex_clean(text):
99
+ # Merge hyphenated words
100
+ text = re.sub(r"(\w+)-\n(\w+)", r"\1\2", text)
101
+ # If a double newline ends in a letter, add a full stop.
102
+ text = re.sub(r'(?<=[a-zA-Z])\n\n', '.\n\n', text)
103
+ # Fix newlines in the middle of sentences
104
+ text = re.sub(r"(?<!\n\s)\n(?!\s\n)", " ", text.strip())
105
+ # Remove multiple newlines
106
+ text = re.sub(r"\n\s*\n", "\n\n", text)
107
+ text = re.sub(r" ", " ", text)
108
+ # Add full stops and new lines between words with no space between where the second one has a capital letter
109
+ text = re.sub(r'(?<=[a-z])(?=[A-Z])', '. \n\n', text)
110
+
111
+ return text
112
+
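+ # A tiny, self-contained check of what text_regex_clean above does to typical PDF-extracted
+ # text; the input string is made up for illustration.
+ def demo_text_regex_clean() -> str:
+     # The hyphenated line-break is merged and the mid-sentence newline becomes a space,
+     # giving "evaluation of the borough plan"
+     return text_regex_clean("evalu-\nation of the\nborough plan")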
113
+ def parse_pdf(file) -> List[str]:
114
+
115
+ """
116
+ Extract text from a PDF file.
117
+
118
+ Parameters:
119
+ file_path (str): Path to the PDF file.
120
+
121
+ Returns:
122
+ List[str]: Extracted text from the PDF.
123
+ """
124
+
125
+ output = []
126
+ #for file in files:
127
+ print(file) # .name
128
+ pdf = PdfReader(file) #[i] .name[i]
129
+
130
+ for page in pdf.pages:
131
+ text = page.extract_text()
132
+
133
+ text = text_regex_clean(text)
134
+
135
+ output.append(text)
136
+ return output
137
+
138
+ def parse_docx(file_path):
139
+ """
140
+ Reads the content of a .docx file and returns it as a string.
141
+
142
+ Parameters:
143
+ - file_path (str): Path to the .docx file.
144
+
145
+ Returns:
146
+ - str: Content of the .docx file.
147
+ """
148
+ doc = Doc(file_path)
149
+ full_text = []
150
+ for para in doc.paragraphs:
151
+ para_text = text_regex_clean(para.text)
152
+
153
+ full_text.append(para_text.replace(" ", " ").strip())
154
+ return '\n'.join(full_text)
155
+
156
+ def parse_txt(file_path):
157
+ """
158
+ Read text from a TXT or HTML file.
159
+
160
+ Parameters:
161
+ file_path (str): Path to the TXT or HTML file.
162
+
163
+ Returns:
164
+ str: Text content of the file.
165
+ """
166
+ with open(file_path, 'r', encoding="utf-8") as file:
167
+ file_contents = file.read().replace(" ", " ").strip()
168
+
169
+ file_contents = text_regex_clean(file_contents)
170
+
171
+ return file_contents
172
+
173
+ def parse_html(page_url, div_filter="p"):
174
+ """
175
+ Determine if the source is a web URL or a local HTML file, extract the content based on the div of choice. Also tries to extract dates (WIP)
176
+
177
+ Parameters:
178
+ page_url (str): The web URL or local file path.
179
+
180
+ Returns:
181
+ str: Extracted content.
182
+ """
183
+
184
+ def is_web_url(s):
185
+ """
186
+ Check if the input string is a web URL.
187
+ """
188
+ return s.startswith("http://") or s.startswith("https://")
189
+
190
+ def is_local_html_file(s):
191
+ """
192
+ Check if the input string is a path to a local HTML file.
193
+ """
194
+ return (s.endswith(".html") or s.endswith(".htm")) and os.path.isfile(s)
195
+
196
+ def extract_text_from_source(source):
197
+ """
198
+ Determine if the source is a web URL or a local HTML file,
199
+ and then extract its content accordingly.
200
+
201
+ Parameters:
202
+ source (str): The web URL or local file path.
203
+
204
+ Returns:
205
+ str: Extracted content.
206
+ """
207
+ if is_web_url(source):
208
+ response = requests.get(source)
209
+ response.raise_for_status() # Raise an HTTPError for bad responses
210
+ return response.text.replace(" ", " ").strip()
211
+ elif is_local_html_file(source):
212
+ with open(source, 'r', encoding='utf-8') as file:
213
+ file_out = file.read().replace(" ", " ").strip()
214
+ return file_out
215
+ else:
216
+ raise ValueError("Input is neither a valid web URL nor a local HTML file path.")
217
+
218
+
219
+ def clean_html_data(data, date_filter="", div_filt="p"):
220
+ """
221
+ Extracts and cleans data from HTML content.
222
+
223
+ Parameters:
224
+ data (str): HTML content to be parsed.
225
+ date_filter (str, optional): Date string to filter results. If set, only content with a date greater than this will be returned.
226
+ div_filt (str, optional): HTML tag to search for text content. Defaults to "p".
227
+
228
+ Returns:
229
+ tuple: Contains extracted text and date as strings. Returns empty strings if not found.
230
+ """
231
+
232
+ soup = BeautifulSoup(data, 'html.parser')
233
+
234
+ # Function to exclude div with id "bar"
235
+ def exclude_div_with_id_bar(tag):
236
+ return tag.has_attr('id') and tag['id'] == 'related-links'
237
+
238
+ text_elements = soup.find_all(div_filt)
239
+ date_elements = soup.find_all(div_filt, {"class": "page-neutral-intro__meta"})
240
+
241
+ # Extract date
242
+ date_out = ""
243
+ if date_elements:
244
+ date_out = re.search(">(.*?)<", str(date_elements[0])).group(1)
245
+ date_dt = dateutil.parser.parse(date_out)
246
+
247
+ if date_filter:
248
+ date_filter_dt = dateutil.parser.parse(date_filter)
249
+ if date_dt < date_filter_dt:
250
+ return '', date_out
251
+
252
+ # Extract text
253
+ text_out_final = ""
254
+ if text_elements:
255
+ text_out_final = '\n'.join(paragraph.text for paragraph in text_elements)
256
+ text_out_final = text_regex_clean(text_out_final)
257
+ else:
258
+ print(f"No elements found with tag '{div_filt}'. No text returned.")
259
+
260
+ return text_out_final, date_out
261
+
262
+
263
+ #page_url = "https://pypi.org/project/InstructorEmbedding/" #'https://www.ons.gov.uk/visualisations/censusareachanges/E09000022/index.html'
264
+
265
+ html_text = extract_text_from_source(page_url)
266
+ #print(page.text)
267
+
268
+ texts = []
269
+ metadatas = []
270
+
271
+ clean_text, date = clean_html_data(html_text, date_filter="", div_filt=div_filter)
272
+ texts.append(clean_text)
273
+ metadatas.append({"source": page_url, "date":str(date)})
274
+
275
+ return texts, metadatas
276
+
277
+ # +
278
+ # Convert parsed text to docs
279
+ # -
280
+
281
+ def text_to_docs(text_dict: dict, chunk_size: int = chunk_size) -> List[Document]:
282
+ """
283
+ Converts the output of parse_file (a dictionary of file paths to content)
284
+ to a list of Documents with metadata.
285
+ """
286
+
287
+ doc_sections = []
288
+ parent_doc_sections = []
289
+
290
+ for file_path, content in text_dict.items():
291
+ ext = os.path.splitext(file_path)[1].lower()
292
+
293
+ # Depending on the file extension, handle the content
294
+ if ext == '.pdf':
295
+ docs, page_docs = pdf_text_to_docs(content, chunk_size)
296
+ elif ext in ['.html', '.htm', '.txt', '.docx']:
297
+ # Assuming you want to process HTML similarly to PDF in this context
298
+ docs = html_text_to_docs(content, chunk_size)
299
+ else:
300
+ print(f"Unsupported file type {ext} for {file_path}. Skipping.")
301
+ continue
302
+
303
+
304
+ match = re.search(r'.*[\/\\](.+)$', file_path)
305
+ filename_end = match.group(1)
306
+
307
+ # Add filename as metadata
308
+ for doc in docs: doc.metadata["source"] = filename_end
309
+ #for parent_doc in parent_docs: parent_doc.metadata["source"] = filename_end
310
+
311
+ doc_sections.extend(docs)
312
+ #parent_doc_sections.extend(parent_docs)
313
+
314
+ return doc_sections, page_docs
315
+
316
+ def pdf_text_to_docs(text, chunk_size: int = chunk_size) -> List[Document]:
317
+ """Converts a string or list of strings to a list of Documents
318
+ with metadata."""
319
+
320
+ #print(text)
321
+
322
+ if isinstance(text, str):
323
+ # Take a single string as one page
324
+ text = [text]
325
+
326
+ page_docs = [Document(page_content=page, metadata={"page": page}) for page in text]
327
+
328
+
329
+ # Add page numbers as metadata
330
+ for i, doc in enumerate(page_docs):
331
+ doc.metadata["page"] = i + 1
332
+
333
+ print("page docs are: ")
334
+ print(page_docs)
335
+
336
+ # Split pages into sections
337
+ doc_sections = []
338
+
339
+ for doc in page_docs:
340
+
341
+ #print("page content: ")
342
+ #print(doc.page_content)
343
+
344
+ if doc.page_content == '':
345
+ sections = ['']
346
+
347
+ else:
348
+ text_splitter = RecursiveCharacterTextSplitter(
349
+ chunk_size=chunk_size,
350
+ separators=split_strat,#["\n\n", "\n", ".", "!", "?", ",", " ", ""],
351
+ chunk_overlap=chunk_overlap,
352
+ add_start_index=True
353
+ )
354
+ sections = text_splitter.split_text(doc.page_content)
355
+
356
+ for i, section in enumerate(sections):
357
+ doc = Document(
358
+ page_content=section, metadata={"page": doc.metadata["page"], "section": i, "page_section": f"{doc.metadata['page']}-{i}"})
359
+
360
+
361
+ doc_sections.append(doc)
362
+
363
+ return doc_sections, page_docs#, parent_doc
364
+
365
+ def html_text_to_docs(texts, metadatas, chunk_size:int = chunk_size):
366
+
367
+ text_splitter = RecursiveCharacterTextSplitter(
368
+ separators=split_strat,#["\n\n", "\n", ".", "!", "?", ",", " ", ""],
369
+ chunk_size=chunk_size,
370
+ chunk_overlap=chunk_overlap,
371
+ length_function=len,
372
+ add_start_index=True
373
+ )
374
+
375
+ #print(texts)
376
+ #print(metadatas)
377
+
378
+ documents = text_splitter.create_documents(texts, metadatas=metadatas)
379
+
380
+ for i, section in enumerate(documents):
381
+ section.metadata["section"] = i + 1
382
+
383
+ return documents
384
+
385
+ # # Functions for working with documents after loading them back in
386
+
387
+ def pull_out_data(series):
388
+
389
+ # define a lambda function to convert each string into a tuple
390
+ to_tuple = lambda x: eval(x)
391
+
392
+ # apply the lambda function to each element of the series
393
+ series_tup = series.apply(to_tuple)
394
+
395
+ series_tup_content = list(zip(*series_tup))[1]
396
+
397
+ series = pd.Series(list(series_tup_content))#.str.replace("^Main post content", "", regex=True).str.strip()
398
+
399
+ return series
400
+
401
+ def docs_from_csv(df):
402
+
403
+ import ast
404
+
405
+ documents = []
406
+
407
+ page_content = pull_out_data(df["0"])
408
+ metadatas = pull_out_data(df["1"])
409
+
410
+ for x in range(0,len(df)):
411
+ new_doc = Document(page_content=page_content[x], metadata=metadatas[x])
412
+ documents.append(new_doc)
413
+
414
+ return documents
415
+
416
+ def docs_from_lists(docs, metadatas):
417
+
418
+ documents = []
419
+
420
+ for x, doc in enumerate(docs):
421
+ new_doc = Document(page_content=doc, metadata=metadatas[x])
422
+ documents.append(new_doc)
423
+
424
+ return documents
425
+
426
+ def docs_elements_from_csv_save(docs_path="documents.csv"):
427
+
428
+ documents = pd.read_csv(docs_path)
429
+
430
+ docs_out = docs_from_csv(documents)
431
+
432
+ out_df = pd.DataFrame(docs_out)
433
+
434
+ docs_content = pull_out_data(out_df[0].astype(str))
435
+
436
+ docs_meta = pull_out_data(out_df[1].astype(str))
437
+
438
+ doc_sources = [d['source'] for d in docs_meta]
439
+
440
+ return out_df, docs_content, docs_meta, doc_sources
441
+
442
+ # ## Create embeddings and save faiss vector store to the path specified in `save_to`
443
+
444
+ def load_embeddings(model_name = "thenlper/gte-base"):
445
+
446
+ if model_name == "hkunlp/instructor-large":
447
+ embeddings_func = HuggingFaceInstructEmbeddings(model_name=model_name,
448
+ embed_instruction="Represent the paragraph for retrieval: ",
449
+ query_instruction="Represent the question for retrieving supporting documents: "
450
+ )
451
+
452
+ else:
453
+ embeddings_func = HuggingFaceEmbeddings(model_name=model_name)
454
+
455
+ global embeddings
456
+
457
+ embeddings = embeddings_func
458
+
459
+ #return embeddings_func
460
+
461
+ def embed_faiss_save_to_zip(docs_out, save_to="faiss_lambeth_census_embedding", model_name = "thenlper/gte-base"):
462
+
463
+ load_embeddings(model_name=model_name)
464
+
465
+ #embeddings_fast = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
466
+
467
+ print(f"> Total split documents: {len(docs_out)}")
468
+
469
+ vectorstore = FAISS.from_documents(documents=docs_out, embedding=embeddings)
470
+
471
+
472
+ if not Path(save_to).exists():
473
+ os.mkdir(save_to)
+ vectorstore.save_local(folder_path=save_to)
474
+
475
+ print("> DONE")
476
+ print(f"> Saved to: {save_to}")
477
+
478
+ ### Save as zip, then remove faiss/pkl files to allow for upload to huggingface
479
+
480
+ import shutil
481
+
482
+ shutil.make_archive(save_to, 'zip', save_to)
483
+
484
+ os.remove(save_to + "/index.faiss")
485
+ os.remove(save_to + "/index.pkl")
486
+
487
+ shutil.move(save_to + '.zip', save_to + "/" + save_to + '.zip')
488
+
489
+ return vectorstore
490
+
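+ # A minimal sketch of loading a saved index back in, mirroring sim_search_local_saved_vec
+ # below. The folder name is an assumption: it must contain index.faiss / index.pkl, so an
+ # archive produced by embed_faiss_save_to_zip would need to be unzipped first.
+ def load_faiss_store(save_to: str = "faiss_embedding", model_name: str = "thenlper/gte-base"):
+     load_embeddings(model_name=model_name)
+     return FAISS.load_local(folder_path=save_to, embeddings=embeddings)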
491
+ def docs_to_chroma_save(embeddings, docs_out:PandasDataFrame, save_to:str):
492
+ print(f"> Total split documents: {len(docs_out)}")
493
+
494
+ vectordb = Chroma.from_documents(documents=docs_out,
495
+ embedding=embeddings,
496
+ persist_directory=save_to)
497
+
498
+ # persist the db to disk
499
+ vectordb.persist()
500
+
501
+ print("> DONE")
502
+ print(f"> Saved to: {save_to}")
503
+
504
+ return vectordb
505
+
506
+ def sim_search_local_saved_vec(query, k_val, save_to="faiss_lambeth_census_embedding"):
507
+
508
+ load_embeddings()
509
+
510
+ docsearch = FAISS.load_local(folder_path=save_to, embeddings=embeddings)
511
+
512
+
513
+ print(query)
514
+
515
+ search = docsearch.similarity_search_with_score(query, k=k_val)
516
+
517
+ for item in search:
518
+ print(item[0].page_content)
519
+ print(f"Page: {item[0].metadata['source']}")
520
+ print(f"Date: {item[0].metadata['date']}")
521
+ print(f"Score: {item[1]}")
522
+ print("---")
chatfuncs/ingest_borough_plan.py ADDED
@@ -0,0 +1,16 @@
1
+ import ingest as ing
2
+ import pandas as pd
3
+
4
+ borough_plan_text = ing.parse_file([open("faiss_embedding/Lambeth_2030-Our_Future_Our_Lambeth.pdf")])
5
+ print("Borough plan text created")
6
+
7
+ #print(borough_plan_text)
8
+
9
+ borough_plan_docs, borough_plan_page_docs = ing.text_to_docs(borough_plan_text)
10
+ print("Borough plan docs created")
11
+
12
+ embedding_model = "thenlper/gte-base"
13
+
14
+ ing.load_embeddings(model_name = embedding_model)
15
+ ing.embed_faiss_save_to_zip(borough_plan_docs, save_to="faiss_embedding", model_name = embedding_model)
16
+ #borough_plan_parent_docs.to_csv("borough_plan_parent_docs.csv", index=False)
faiss_embedding/faiss_embedding.zip ADDED
Binary file (441 kB).
 
requirements.txt ADDED
@@ -0,0 +1,17 @@
1
+ langchain
2
+ beautifulsoup4
3
+ pandas
4
+ black
5
+ isort
6
+ Flask
7
+ transformers
8
+ --extra-index-url https://download.pytorch.org/whl/cu113
9
+ torch
10
+ sentence_transformers
11
+ faiss-cpu
12
+ bitsandbytes
13
+ accelerate
14
+ optimum
15
+ pypdf
16
+ gradio
17
+ gradio_client==0.2.7
test/__init__.py ADDED
File without changes
test/sample.docx ADDED
Binary file (12 kB).
 
test/sample.html ADDED
@@ -0,0 +1,769 @@
1
+ <html xmlns:v="urn:schemas-microsoft-com:vml"
2
+ xmlns:o="urn:schemas-microsoft-com:office:office"
3
+ xmlns:w="urn:schemas-microsoft-com:office:word"
4
+ xmlns:m="http://schemas.microsoft.com/office/2004/12/omml"
5
+ xmlns="http://www.w3.org/TR/REC-html40">
6
+
7
+ <head>
8
+ <meta http-equiv=Content-Type content="text/html; charset=windows-1252">
9
+ <meta name=ProgId content=Word.Document>
10
+ <meta name=Generator content="Microsoft Word 15">
11
+ <meta name=Originator content="Microsoft Word 15">
12
+ <link rel=File-List href="sample_files/filelist.xml">
13
+ <!--[if gte mso 9]><xml>
14
+ <o:DocumentProperties>
15
+ <o:Author>Sean Pedrick-Case</o:Author>
16
+ <o:LastAuthor>Sean Pedrick-Case</o:LastAuthor>
17
+ <o:Revision>2</o:Revision>
18
+ <o:TotalTime>1</o:TotalTime>
19
+ <o:Created>2023-08-07T09:40:00Z</o:Created>
20
+ <o:LastSaved>2023-08-07T09:40:00Z</o:LastSaved>
21
+ <o:Pages>1</o:Pages>
22
+ <o:Words>2</o:Words>
23
+ <o:Characters>12</o:Characters>
24
+ <o:Lines>1</o:Lines>
25
+ <o:Paragraphs>1</o:Paragraphs>
26
+ <o:CharactersWithSpaces>13</o:CharactersWithSpaces>
27
+ <o:Version>16.00</o:Version>
28
+ </o:DocumentProperties>
29
+ <o:OfficeDocumentSettings>
30
+ <o:AllowPNG/>
31
+ </o:OfficeDocumentSettings>
32
+ </xml><![endif]-->
33
+ <link rel=themeData href="sample_files/themedata.thmx">
34
+ <link rel=colorSchemeMapping href="sample_files/colorschememapping.xml">
35
+ <!--[if gte mso 9]><xml>
36
+ <w:WordDocument>
37
+ <w:SpellingState>Clean</w:SpellingState>
38
+ <w:GrammarState>Clean</w:GrammarState>
39
+ <w:TrackMoves/>
40
+ <w:TrackFormatting/>
41
+ <w:PunctuationKerning/>
42
+ <w:ValidateAgainstSchemas/>
43
+ <w:SaveIfXMLInvalid>false</w:SaveIfXMLInvalid>
44
+ <w:IgnoreMixedContent>false</w:IgnoreMixedContent>
45
+ <w:AlwaysShowPlaceholderText>false</w:AlwaysShowPlaceholderText>
46
+ <w:DoNotPromoteQF/>
47
+ <w:LidThemeOther>EN-GB</w:LidThemeOther>
48
+ <w:LidThemeAsian>X-NONE</w:LidThemeAsian>
49
+ <w:LidThemeComplexScript>X-NONE</w:LidThemeComplexScript>
50
+ <w:Compatibility>
51
+ <w:BreakWrappedTables/>
52
+ <w:SnapToGridInCell/>
53
+ <w:WrapTextWithPunct/>
54
+ <w:UseAsianBreakRules/>
55
+ <w:DontGrowAutofit/>
56
+ <w:SplitPgBreakAndParaMark/>
57
+ <w:EnableOpenTypeKerning/>
58
+ <w:DontFlipMirrorIndents/>
59
+ <w:OverrideTableStyleHps/>
60
+ </w:Compatibility>
61
+ <m:mathPr>
62
+ <m:mathFont m:val="Cambria Math"/>
63
+ <m:brkBin m:val="before"/>
64
+ <m:brkBinSub m:val="&#45;-"/>
65
+ <m:smallFrac m:val="off"/>
66
+ <m:dispDef/>
67
+ <m:lMargin m:val="0"/>
68
+ <m:rMargin m:val="0"/>
69
+ <m:defJc m:val="centerGroup"/>
70
+ <m:wrapIndent m:val="1440"/>
71
+ <m:intLim m:val="subSup"/>
72
+ <m:naryLim m:val="undOvr"/>
73
+ </m:mathPr></w:WordDocument>
74
+ </xml><![endif]--><!--[if gte mso 9]><xml>
75
+ <w:LatentStyles DefLockedState="false" DefUnhideWhenUsed="false"
76
+ DefSemiHidden="false" DefQFormat="false" DefPriority="99"
77
+ LatentStyleCount="376">
78
+ <w:LsdException Locked="false" Priority="0" QFormat="true" Name="Normal"/>
79
+ <w:LsdException Locked="false" Priority="9" QFormat="true" Name="heading 1"/>
80
+ <w:LsdException Locked="false" Priority="9" SemiHidden="true"
81
+ UnhideWhenUsed="true" QFormat="true" Name="heading 2"/>
82
+ <w:LsdException Locked="false" Priority="9" SemiHidden="true"
83
+ UnhideWhenUsed="true" QFormat="true" Name="heading 3"/>
84
+ <w:LsdException Locked="false" Priority="9" SemiHidden="true"
85
+ UnhideWhenUsed="true" QFormat="true" Name="heading 4"/>
86
+ <w:LsdException Locked="false" Priority="9" SemiHidden="true"
87
+ UnhideWhenUsed="true" QFormat="true" Name="heading 5"/>
88
+ <w:LsdException Locked="false" Priority="9" SemiHidden="true"
89
+ UnhideWhenUsed="true" QFormat="true" Name="heading 6"/>
90
+ <w:LsdException Locked="false" Priority="9" SemiHidden="true"
91
+ UnhideWhenUsed="true" QFormat="true" Name="heading 7"/>
92
+ <w:LsdException Locked="false" Priority="9" SemiHidden="true"
93
+ UnhideWhenUsed="true" QFormat="true" Name="heading 8"/>
94
+ <w:LsdException Locked="false" Priority="9" SemiHidden="true"
95
+ UnhideWhenUsed="true" QFormat="true" Name="heading 9"/>
96
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
97
+ Name="index 1"/>
98
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
99
+ Name="index 2"/>
100
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
101
+ Name="index 3"/>
102
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
103
+ Name="index 4"/>
104
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
105
+ Name="index 5"/>
106
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
107
+ Name="index 6"/>
108
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
109
+ Name="index 7"/>
110
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
111
+ Name="index 8"/>
112
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
113
+ Name="index 9"/>
114
+ <w:LsdException Locked="false" Priority="39" SemiHidden="true"
115
+ UnhideWhenUsed="true" Name="toc 1"/>
116
+ <w:LsdException Locked="false" Priority="39" SemiHidden="true"
117
+ UnhideWhenUsed="true" Name="toc 2"/>
118
+ <w:LsdException Locked="false" Priority="39" SemiHidden="true"
119
+ UnhideWhenUsed="true" Name="toc 3"/>
120
+ <w:LsdException Locked="false" Priority="39" SemiHidden="true"
121
+ UnhideWhenUsed="true" Name="toc 4"/>
122
+ <w:LsdException Locked="false" Priority="39" SemiHidden="true"
123
+ UnhideWhenUsed="true" Name="toc 5"/>
124
+ <w:LsdException Locked="false" Priority="39" SemiHidden="true"
125
+ UnhideWhenUsed="true" Name="toc 6"/>
126
+ <w:LsdException Locked="false" Priority="39" SemiHidden="true"
127
+ UnhideWhenUsed="true" Name="toc 7"/>
128
+ <w:LsdException Locked="false" Priority="39" SemiHidden="true"
129
+ UnhideWhenUsed="true" Name="toc 8"/>
130
+ <w:LsdException Locked="false" Priority="39" SemiHidden="true"
131
+ UnhideWhenUsed="true" Name="toc 9"/>
132
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
133
+ Name="Normal Indent"/>
134
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
135
+ Name="footnote text"/>
136
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
137
+ Name="annotation text"/>
138
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
139
+ Name="header"/>
140
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
141
+ Name="footer"/>
142
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
143
+ Name="index heading"/>
144
+ <w:LsdException Locked="false" Priority="35" SemiHidden="true"
145
+ UnhideWhenUsed="true" QFormat="true" Name="caption"/>
146
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
147
+ Name="table of figures"/>
148
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
149
+ Name="envelope address"/>
150
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
151
+ Name="envelope return"/>
152
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
153
+ Name="footnote reference"/>
154
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
155
+ Name="annotation reference"/>
156
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
157
+ Name="line number"/>
158
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
159
+ Name="page number"/>
160
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
161
+ Name="endnote reference"/>
162
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
163
+ Name="endnote text"/>
164
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
165
+ Name="table of authorities"/>
166
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
167
+ Name="macro"/>
168
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
169
+ Name="toa heading"/>
170
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
171
+ Name="List"/>
172
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
173
+ Name="List Bullet"/>
174
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
175
+ Name="List Number"/>
176
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
177
+ Name="List 2"/>
178
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
179
+ Name="List 3"/>
180
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
181
+ Name="List 4"/>
182
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
183
+ Name="List 5"/>
184
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
185
+ Name="List Bullet 2"/>
186
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
187
+ Name="List Bullet 3"/>
188
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
189
+ Name="List Bullet 4"/>
190
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
191
+ Name="List Bullet 5"/>
192
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
193
+ Name="List Number 2"/>
194
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
195
+ Name="List Number 3"/>
196
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
197
+ Name="List Number 4"/>
198
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
199
+ Name="List Number 5"/>
200
+ <w:LsdException Locked="false" Priority="10" QFormat="true" Name="Title"/>
201
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
202
+ Name="Closing"/>
203
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
204
+ Name="Signature"/>
205
+ <w:LsdException Locked="false" Priority="1" SemiHidden="true"
206
+ UnhideWhenUsed="true" Name="Default Paragraph Font"/>
207
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
208
+ Name="Body Text"/>
209
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
210
+ Name="Body Text Indent"/>
211
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
212
+ Name="List Continue"/>
213
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
214
+ Name="List Continue 2"/>
215
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
216
+ Name="List Continue 3"/>
217
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
218
+ Name="List Continue 4"/>
219
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
220
+ Name="List Continue 5"/>
221
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
222
+ Name="Message Header"/>
223
+ <w:LsdException Locked="false" Priority="11" QFormat="true" Name="Subtitle"/>
224
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
225
+ Name="Salutation"/>
226
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
227
+ Name="Date"/>
228
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
229
+ Name="Body Text First Indent"/>
230
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
231
+ Name="Body Text First Indent 2"/>
232
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
233
+ Name="Note Heading"/>
234
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
235
+ Name="Body Text 2"/>
236
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
237
+ Name="Body Text 3"/>
238
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
239
+ Name="Body Text Indent 2"/>
240
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
241
+ Name="Body Text Indent 3"/>
242
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
243
+ Name="Block Text"/>
244
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
245
+ Name="Hyperlink"/>
246
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
247
+ Name="FollowedHyperlink"/>
248
+ <w:LsdException Locked="false" Priority="22" QFormat="true" Name="Strong"/>
249
+ <w:LsdException Locked="false" Priority="20" QFormat="true" Name="Emphasis"/>
250
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
251
+ Name="Document Map"/>
252
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
253
+ Name="Plain Text"/>
254
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
255
+ Name="E-mail Signature"/>
256
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
257
+ Name="HTML Top of Form"/>
258
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
259
+ Name="HTML Bottom of Form"/>
260
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
261
+ Name="Normal (Web)"/>
262
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
263
+ Name="HTML Acronym"/>
264
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
265
+ Name="HTML Address"/>
266
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
267
+ Name="HTML Cite"/>
268
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
269
+ Name="HTML Code"/>
270
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
271
+ Name="HTML Definition"/>
272
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
273
+ Name="HTML Keyboard"/>
274
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
275
+ Name="HTML Preformatted"/>
276
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
277
+ Name="HTML Sample"/>
278
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
279
+ Name="HTML Typewriter"/>
280
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
281
+ Name="HTML Variable"/>
282
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
283
+ Name="Normal Table"/>
284
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
285
+ Name="annotation subject"/>
286
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
287
+ Name="No List"/>
288
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
289
+ Name="Outline List 1"/>
290
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
291
+ Name="Outline List 2"/>
292
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
293
+ Name="Outline List 3"/>
294
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
295
+ Name="Table Simple 1"/>
296
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
297
+ Name="Table Simple 2"/>
298
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
299
+ Name="Table Simple 3"/>
300
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
301
+ Name="Table Classic 1"/>
302
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
303
+ Name="Table Classic 2"/>
304
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
305
+ Name="Table Classic 3"/>
306
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
307
+ Name="Table Classic 4"/>
308
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
309
+ Name="Table Colorful 1"/>
310
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
311
+ Name="Table Colorful 2"/>
312
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
313
+ Name="Table Colorful 3"/>
314
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
315
+ Name="Table Columns 1"/>
316
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
317
+ Name="Table Columns 2"/>
318
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
319
+ Name="Table Columns 3"/>
320
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
321
+ Name="Table Columns 4"/>
322
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
323
+ Name="Table Columns 5"/>
324
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
325
+ Name="Table Grid 1"/>
326
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
327
+ Name="Table Grid 2"/>
328
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
329
+ Name="Table Grid 3"/>
330
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
331
+ Name="Table Grid 4"/>
332
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
333
+ Name="Table Grid 5"/>
334
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
335
+ Name="Table Grid 6"/>
336
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
337
+ Name="Table Grid 7"/>
338
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
339
+ Name="Table Grid 8"/>
340
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
341
+ Name="Table List 1"/>
342
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
343
+ Name="Table List 2"/>
344
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
345
+ Name="Table List 3"/>
346
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
347
+ Name="Table List 4"/>
348
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
349
+ Name="Table List 5"/>
350
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
351
+ Name="Table List 6"/>
352
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
353
+ Name="Table List 7"/>
354
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
355
+ Name="Table List 8"/>
356
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
357
+ Name="Table 3D effects 1"/>
358
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
359
+ Name="Table 3D effects 2"/>
360
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
361
+ Name="Table 3D effects 3"/>
362
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
363
+ Name="Table Contemporary"/>
364
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
365
+ Name="Table Elegant"/>
366
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
367
+ Name="Table Professional"/>
368
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
369
+ Name="Table Subtle 1"/>
370
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
371
+ Name="Table Subtle 2"/>
372
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
373
+ Name="Table Web 1"/>
374
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
375
+ Name="Table Web 2"/>
376
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
377
+ Name="Table Web 3"/>
378
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
379
+ Name="Balloon Text"/>
380
+ <w:LsdException Locked="false" Priority="39" Name="Table Grid"/>
381
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
382
+ Name="Table Theme"/>
383
+ <w:LsdException Locked="false" SemiHidden="true" Name="Placeholder Text"/>
384
+ <w:LsdException Locked="false" Priority="1" QFormat="true" Name="No Spacing"/>
385
+ <w:LsdException Locked="false" Priority="60" Name="Light Shading"/>
386
+ <w:LsdException Locked="false" Priority="61" Name="Light List"/>
387
+ <w:LsdException Locked="false" Priority="62" Name="Light Grid"/>
388
+ <w:LsdException Locked="false" Priority="63" Name="Medium Shading 1"/>
389
+ <w:LsdException Locked="false" Priority="64" Name="Medium Shading 2"/>
390
+ <w:LsdException Locked="false" Priority="65" Name="Medium List 1"/>
391
+ <w:LsdException Locked="false" Priority="66" Name="Medium List 2"/>
392
+ <w:LsdException Locked="false" Priority="67" Name="Medium Grid 1"/>
393
+ <w:LsdException Locked="false" Priority="68" Name="Medium Grid 2"/>
394
+ <w:LsdException Locked="false" Priority="69" Name="Medium Grid 3"/>
395
+ <w:LsdException Locked="false" Priority="70" Name="Dark List"/>
396
+ <w:LsdException Locked="false" Priority="71" Name="Colorful Shading"/>
397
+ <w:LsdException Locked="false" Priority="72" Name="Colorful List"/>
398
+ <w:LsdException Locked="false" Priority="73" Name="Colorful Grid"/>
399
+ <w:LsdException Locked="false" Priority="60" Name="Light Shading Accent 1"/>
400
+ <w:LsdException Locked="false" Priority="61" Name="Light List Accent 1"/>
401
+ <w:LsdException Locked="false" Priority="62" Name="Light Grid Accent 1"/>
402
+ <w:LsdException Locked="false" Priority="63" Name="Medium Shading 1 Accent 1"/>
403
+ <w:LsdException Locked="false" Priority="64" Name="Medium Shading 2 Accent 1"/>
404
+ <w:LsdException Locked="false" Priority="65" Name="Medium List 1 Accent 1"/>
405
+ <w:LsdException Locked="false" SemiHidden="true" Name="Revision"/>
406
+ <w:LsdException Locked="false" Priority="34" QFormat="true"
407
+ Name="List Paragraph"/>
408
+ <w:LsdException Locked="false" Priority="29" QFormat="true" Name="Quote"/>
409
+ <w:LsdException Locked="false" Priority="30" QFormat="true"
410
+ Name="Intense Quote"/>
411
+ <w:LsdException Locked="false" Priority="66" Name="Medium List 2 Accent 1"/>
412
+ <w:LsdException Locked="false" Priority="67" Name="Medium Grid 1 Accent 1"/>
413
+ <w:LsdException Locked="false" Priority="68" Name="Medium Grid 2 Accent 1"/>
414
+ <w:LsdException Locked="false" Priority="69" Name="Medium Grid 3 Accent 1"/>
415
+ <w:LsdException Locked="false" Priority="70" Name="Dark List Accent 1"/>
416
+ <w:LsdException Locked="false" Priority="71" Name="Colorful Shading Accent 1"/>
417
+ <w:LsdException Locked="false" Priority="72" Name="Colorful List Accent 1"/>
418
+ <w:LsdException Locked="false" Priority="73" Name="Colorful Grid Accent 1"/>
419
+ <w:LsdException Locked="false" Priority="60" Name="Light Shading Accent 2"/>
420
+ <w:LsdException Locked="false" Priority="61" Name="Light List Accent 2"/>
421
+ <w:LsdException Locked="false" Priority="62" Name="Light Grid Accent 2"/>
422
+ <w:LsdException Locked="false" Priority="63" Name="Medium Shading 1 Accent 2"/>
423
+ <w:LsdException Locked="false" Priority="64" Name="Medium Shading 2 Accent 2"/>
424
+ <w:LsdException Locked="false" Priority="65" Name="Medium List 1 Accent 2"/>
425
+ <w:LsdException Locked="false" Priority="66" Name="Medium List 2 Accent 2"/>
426
+ <w:LsdException Locked="false" Priority="67" Name="Medium Grid 1 Accent 2"/>
427
+ <w:LsdException Locked="false" Priority="68" Name="Medium Grid 2 Accent 2"/>
428
+ <w:LsdException Locked="false" Priority="69" Name="Medium Grid 3 Accent 2"/>
429
+ <w:LsdException Locked="false" Priority="70" Name="Dark List Accent 2"/>
430
+ <w:LsdException Locked="false" Priority="71" Name="Colorful Shading Accent 2"/>
431
+ <w:LsdException Locked="false" Priority="72" Name="Colorful List Accent 2"/>
432
+ <w:LsdException Locked="false" Priority="73" Name="Colorful Grid Accent 2"/>
433
+ <w:LsdException Locked="false" Priority="60" Name="Light Shading Accent 3"/>
434
+ <w:LsdException Locked="false" Priority="61" Name="Light List Accent 3"/>
435
+ <w:LsdException Locked="false" Priority="62" Name="Light Grid Accent 3"/>
436
+ <w:LsdException Locked="false" Priority="63" Name="Medium Shading 1 Accent 3"/>
437
+ <w:LsdException Locked="false" Priority="64" Name="Medium Shading 2 Accent 3"/>
438
+ <w:LsdException Locked="false" Priority="65" Name="Medium List 1 Accent 3"/>
439
+ <w:LsdException Locked="false" Priority="66" Name="Medium List 2 Accent 3"/>
440
+ <w:LsdException Locked="false" Priority="67" Name="Medium Grid 1 Accent 3"/>
441
+ <w:LsdException Locked="false" Priority="68" Name="Medium Grid 2 Accent 3"/>
442
+ <w:LsdException Locked="false" Priority="69" Name="Medium Grid 3 Accent 3"/>
443
+ <w:LsdException Locked="false" Priority="70" Name="Dark List Accent 3"/>
444
+ <w:LsdException Locked="false" Priority="71" Name="Colorful Shading Accent 3"/>
445
+ <w:LsdException Locked="false" Priority="72" Name="Colorful List Accent 3"/>
446
+ <w:LsdException Locked="false" Priority="73" Name="Colorful Grid Accent 3"/>
447
+ <w:LsdException Locked="false" Priority="60" Name="Light Shading Accent 4"/>
448
+ <w:LsdException Locked="false" Priority="61" Name="Light List Accent 4"/>
449
+ <w:LsdException Locked="false" Priority="62" Name="Light Grid Accent 4"/>
450
+ <w:LsdException Locked="false" Priority="63" Name="Medium Shading 1 Accent 4"/>
451
+ <w:LsdException Locked="false" Priority="64" Name="Medium Shading 2 Accent 4"/>
452
+ <w:LsdException Locked="false" Priority="65" Name="Medium List 1 Accent 4"/>
453
+ <w:LsdException Locked="false" Priority="66" Name="Medium List 2 Accent 4"/>
454
+ <w:LsdException Locked="false" Priority="67" Name="Medium Grid 1 Accent 4"/>
455
+ <w:LsdException Locked="false" Priority="68" Name="Medium Grid 2 Accent 4"/>
456
+ <w:LsdException Locked="false" Priority="69" Name="Medium Grid 3 Accent 4"/>
457
+ <w:LsdException Locked="false" Priority="70" Name="Dark List Accent 4"/>
458
+ <w:LsdException Locked="false" Priority="71" Name="Colorful Shading Accent 4"/>
459
+ <w:LsdException Locked="false" Priority="72" Name="Colorful List Accent 4"/>
460
+ <w:LsdException Locked="false" Priority="73" Name="Colorful Grid Accent 4"/>
461
+ <w:LsdException Locked="false" Priority="60" Name="Light Shading Accent 5"/>
462
+ <w:LsdException Locked="false" Priority="61" Name="Light List Accent 5"/>
463
+ <w:LsdException Locked="false" Priority="62" Name="Light Grid Accent 5"/>
464
+ <w:LsdException Locked="false" Priority="63" Name="Medium Shading 1 Accent 5"/>
465
+ <w:LsdException Locked="false" Priority="64" Name="Medium Shading 2 Accent 5"/>
466
+ <w:LsdException Locked="false" Priority="65" Name="Medium List 1 Accent 5"/>
467
+ <w:LsdException Locked="false" Priority="66" Name="Medium List 2 Accent 5"/>
468
+ <w:LsdException Locked="false" Priority="67" Name="Medium Grid 1 Accent 5"/>
469
+ <w:LsdException Locked="false" Priority="68" Name="Medium Grid 2 Accent 5"/>
470
+ <w:LsdException Locked="false" Priority="69" Name="Medium Grid 3 Accent 5"/>
471
+ <w:LsdException Locked="false" Priority="70" Name="Dark List Accent 5"/>
472
+ <w:LsdException Locked="false" Priority="71" Name="Colorful Shading Accent 5"/>
473
+ <w:LsdException Locked="false" Priority="72" Name="Colorful List Accent 5"/>
474
+ <w:LsdException Locked="false" Priority="73" Name="Colorful Grid Accent 5"/>
475
+ <w:LsdException Locked="false" Priority="60" Name="Light Shading Accent 6"/>
476
+ <w:LsdException Locked="false" Priority="61" Name="Light List Accent 6"/>
477
+ <w:LsdException Locked="false" Priority="62" Name="Light Grid Accent 6"/>
478
+ <w:LsdException Locked="false" Priority="63" Name="Medium Shading 1 Accent 6"/>
479
+ <w:LsdException Locked="false" Priority="64" Name="Medium Shading 2 Accent 6"/>
480
+ <w:LsdException Locked="false" Priority="65" Name="Medium List 1 Accent 6"/>
481
+ <w:LsdException Locked="false" Priority="66" Name="Medium List 2 Accent 6"/>
482
+ <w:LsdException Locked="false" Priority="67" Name="Medium Grid 1 Accent 6"/>
483
+ <w:LsdException Locked="false" Priority="68" Name="Medium Grid 2 Accent 6"/>
484
+ <w:LsdException Locked="false" Priority="69" Name="Medium Grid 3 Accent 6"/>
485
+ <w:LsdException Locked="false" Priority="70" Name="Dark List Accent 6"/>
486
+ <w:LsdException Locked="false" Priority="71" Name="Colorful Shading Accent 6"/>
487
+ <w:LsdException Locked="false" Priority="72" Name="Colorful List Accent 6"/>
488
+ <w:LsdException Locked="false" Priority="73" Name="Colorful Grid Accent 6"/>
489
+ <w:LsdException Locked="false" Priority="19" QFormat="true"
490
+ Name="Subtle Emphasis"/>
491
+ <w:LsdException Locked="false" Priority="21" QFormat="true"
492
+ Name="Intense Emphasis"/>
493
+ <w:LsdException Locked="false" Priority="31" QFormat="true"
494
+ Name="Subtle Reference"/>
495
+ <w:LsdException Locked="false" Priority="32" QFormat="true"
496
+ Name="Intense Reference"/>
497
+ <w:LsdException Locked="false" Priority="33" QFormat="true" Name="Book Title"/>
498
+ <w:LsdException Locked="false" Priority="37" SemiHidden="true"
499
+ UnhideWhenUsed="true" Name="Bibliography"/>
500
+ <w:LsdException Locked="false" Priority="39" SemiHidden="true"
501
+ UnhideWhenUsed="true" QFormat="true" Name="TOC Heading"/>
502
+ <w:LsdException Locked="false" Priority="41" Name="Plain Table 1"/>
503
+ <w:LsdException Locked="false" Priority="42" Name="Plain Table 2"/>
504
+ <w:LsdException Locked="false" Priority="43" Name="Plain Table 3"/>
505
+ <w:LsdException Locked="false" Priority="44" Name="Plain Table 4"/>
506
+ <w:LsdException Locked="false" Priority="45" Name="Plain Table 5"/>
507
+ <w:LsdException Locked="false" Priority="40" Name="Grid Table Light"/>
508
+ <w:LsdException Locked="false" Priority="46" Name="Grid Table 1 Light"/>
509
+ <w:LsdException Locked="false" Priority="47" Name="Grid Table 2"/>
510
+ <w:LsdException Locked="false" Priority="48" Name="Grid Table 3"/>
511
+ <w:LsdException Locked="false" Priority="49" Name="Grid Table 4"/>
512
+ <w:LsdException Locked="false" Priority="50" Name="Grid Table 5 Dark"/>
513
+ <w:LsdException Locked="false" Priority="51" Name="Grid Table 6 Colorful"/>
514
+ <w:LsdException Locked="false" Priority="52" Name="Grid Table 7 Colorful"/>
515
+ <w:LsdException Locked="false" Priority="46"
516
+ Name="Grid Table 1 Light Accent 1"/>
517
+ <w:LsdException Locked="false" Priority="47" Name="Grid Table 2 Accent 1"/>
518
+ <w:LsdException Locked="false" Priority="48" Name="Grid Table 3 Accent 1"/>
519
+ <w:LsdException Locked="false" Priority="49" Name="Grid Table 4 Accent 1"/>
520
+ <w:LsdException Locked="false" Priority="50" Name="Grid Table 5 Dark Accent 1"/>
521
+ <w:LsdException Locked="false" Priority="51"
522
+ Name="Grid Table 6 Colorful Accent 1"/>
523
+ <w:LsdException Locked="false" Priority="52"
524
+ Name="Grid Table 7 Colorful Accent 1"/>
525
+ <w:LsdException Locked="false" Priority="46"
526
+ Name="Grid Table 1 Light Accent 2"/>
527
+ <w:LsdException Locked="false" Priority="47" Name="Grid Table 2 Accent 2"/>
528
+ <w:LsdException Locked="false" Priority="48" Name="Grid Table 3 Accent 2"/>
529
+ <w:LsdException Locked="false" Priority="49" Name="Grid Table 4 Accent 2"/>
530
+ <w:LsdException Locked="false" Priority="50" Name="Grid Table 5 Dark Accent 2"/>
531
+ <w:LsdException Locked="false" Priority="51"
532
+ Name="Grid Table 6 Colorful Accent 2"/>
533
+ <w:LsdException Locked="false" Priority="52"
534
+ Name="Grid Table 7 Colorful Accent 2"/>
535
+ <w:LsdException Locked="false" Priority="46"
536
+ Name="Grid Table 1 Light Accent 3"/>
537
+ <w:LsdException Locked="false" Priority="47" Name="Grid Table 2 Accent 3"/>
538
+ <w:LsdException Locked="false" Priority="48" Name="Grid Table 3 Accent 3"/>
539
+ <w:LsdException Locked="false" Priority="49" Name="Grid Table 4 Accent 3"/>
540
+ <w:LsdException Locked="false" Priority="50" Name="Grid Table 5 Dark Accent 3"/>
541
+ <w:LsdException Locked="false" Priority="51"
542
+ Name="Grid Table 6 Colorful Accent 3"/>
543
+ <w:LsdException Locked="false" Priority="52"
544
+ Name="Grid Table 7 Colorful Accent 3"/>
545
+ <w:LsdException Locked="false" Priority="46"
546
+ Name="Grid Table 1 Light Accent 4"/>
547
+ <w:LsdException Locked="false" Priority="47" Name="Grid Table 2 Accent 4"/>
548
+ <w:LsdException Locked="false" Priority="48" Name="Grid Table 3 Accent 4"/>
549
+ <w:LsdException Locked="false" Priority="49" Name="Grid Table 4 Accent 4"/>
550
+ <w:LsdException Locked="false" Priority="50" Name="Grid Table 5 Dark Accent 4"/>
551
+ <w:LsdException Locked="false" Priority="51"
552
+ Name="Grid Table 6 Colorful Accent 4"/>
553
+ <w:LsdException Locked="false" Priority="52"
554
+ Name="Grid Table 7 Colorful Accent 4"/>
555
+ <w:LsdException Locked="false" Priority="46"
556
+ Name="Grid Table 1 Light Accent 5"/>
557
+ <w:LsdException Locked="false" Priority="47" Name="Grid Table 2 Accent 5"/>
558
+ <w:LsdException Locked="false" Priority="48" Name="Grid Table 3 Accent 5"/>
559
+ <w:LsdException Locked="false" Priority="49" Name="Grid Table 4 Accent 5"/>
560
+ <w:LsdException Locked="false" Priority="50" Name="Grid Table 5 Dark Accent 5"/>
561
+ <w:LsdException Locked="false" Priority="51"
562
+ Name="Grid Table 6 Colorful Accent 5"/>
563
+ <w:LsdException Locked="false" Priority="52"
564
+ Name="Grid Table 7 Colorful Accent 5"/>
565
+ <w:LsdException Locked="false" Priority="46"
566
+ Name="Grid Table 1 Light Accent 6"/>
567
+ <w:LsdException Locked="false" Priority="47" Name="Grid Table 2 Accent 6"/>
568
+ <w:LsdException Locked="false" Priority="48" Name="Grid Table 3 Accent 6"/>
569
+ <w:LsdException Locked="false" Priority="49" Name="Grid Table 4 Accent 6"/>
570
+ <w:LsdException Locked="false" Priority="50" Name="Grid Table 5 Dark Accent 6"/>
571
+ <w:LsdException Locked="false" Priority="51"
572
+ Name="Grid Table 6 Colorful Accent 6"/>
573
+ <w:LsdException Locked="false" Priority="52"
574
+ Name="Grid Table 7 Colorful Accent 6"/>
575
+ <w:LsdException Locked="false" Priority="46" Name="List Table 1 Light"/>
576
+ <w:LsdException Locked="false" Priority="47" Name="List Table 2"/>
577
+ <w:LsdException Locked="false" Priority="48" Name="List Table 3"/>
578
+ <w:LsdException Locked="false" Priority="49" Name="List Table 4"/>
579
+ <w:LsdException Locked="false" Priority="50" Name="List Table 5 Dark"/>
580
+ <w:LsdException Locked="false" Priority="51" Name="List Table 6 Colorful"/>
581
+ <w:LsdException Locked="false" Priority="52" Name="List Table 7 Colorful"/>
582
+ <w:LsdException Locked="false" Priority="46"
583
+ Name="List Table 1 Light Accent 1"/>
584
+ <w:LsdException Locked="false" Priority="47" Name="List Table 2 Accent 1"/>
585
+ <w:LsdException Locked="false" Priority="48" Name="List Table 3 Accent 1"/>
586
+ <w:LsdException Locked="false" Priority="49" Name="List Table 4 Accent 1"/>
587
+ <w:LsdException Locked="false" Priority="50" Name="List Table 5 Dark Accent 1"/>
588
+ <w:LsdException Locked="false" Priority="51"
589
+ Name="List Table 6 Colorful Accent 1"/>
590
+ <w:LsdException Locked="false" Priority="52"
591
+ Name="List Table 7 Colorful Accent 1"/>
592
+ <w:LsdException Locked="false" Priority="46"
593
+ Name="List Table 1 Light Accent 2"/>
594
+ <w:LsdException Locked="false" Priority="47" Name="List Table 2 Accent 2"/>
595
+ <w:LsdException Locked="false" Priority="48" Name="List Table 3 Accent 2"/>
596
+ <w:LsdException Locked="false" Priority="49" Name="List Table 4 Accent 2"/>
597
+ <w:LsdException Locked="false" Priority="50" Name="List Table 5 Dark Accent 2"/>
598
+ <w:LsdException Locked="false" Priority="51"
599
+ Name="List Table 6 Colorful Accent 2"/>
600
+ <w:LsdException Locked="false" Priority="52"
601
+ Name="List Table 7 Colorful Accent 2"/>
602
+ <w:LsdException Locked="false" Priority="46"
603
+ Name="List Table 1 Light Accent 3"/>
604
+ <w:LsdException Locked="false" Priority="47" Name="List Table 2 Accent 3"/>
605
+ <w:LsdException Locked="false" Priority="48" Name="List Table 3 Accent 3"/>
606
+ <w:LsdException Locked="false" Priority="49" Name="List Table 4 Accent 3"/>
607
+ <w:LsdException Locked="false" Priority="50" Name="List Table 5 Dark Accent 3"/>
608
+ <w:LsdException Locked="false" Priority="51"
609
+ Name="List Table 6 Colorful Accent 3"/>
610
+ <w:LsdException Locked="false" Priority="52"
611
+ Name="List Table 7 Colorful Accent 3"/>
612
+ <w:LsdException Locked="false" Priority="46"
613
+ Name="List Table 1 Light Accent 4"/>
614
+ <w:LsdException Locked="false" Priority="47" Name="List Table 2 Accent 4"/>
615
+ <w:LsdException Locked="false" Priority="48" Name="List Table 3 Accent 4"/>
616
+ <w:LsdException Locked="false" Priority="49" Name="List Table 4 Accent 4"/>
617
+ <w:LsdException Locked="false" Priority="50" Name="List Table 5 Dark Accent 4"/>
618
+ <w:LsdException Locked="false" Priority="51"
619
+ Name="List Table 6 Colorful Accent 4"/>
620
+ <w:LsdException Locked="false" Priority="52"
621
+ Name="List Table 7 Colorful Accent 4"/>
622
+ <w:LsdException Locked="false" Priority="46"
623
+ Name="List Table 1 Light Accent 5"/>
624
+ <w:LsdException Locked="false" Priority="47" Name="List Table 2 Accent 5"/>
625
+ <w:LsdException Locked="false" Priority="48" Name="List Table 3 Accent 5"/>
626
+ <w:LsdException Locked="false" Priority="49" Name="List Table 4 Accent 5"/>
627
+ <w:LsdException Locked="false" Priority="50" Name="List Table 5 Dark Accent 5"/>
628
+ <w:LsdException Locked="false" Priority="51"
629
+ Name="List Table 6 Colorful Accent 5"/>
630
+ <w:LsdException Locked="false" Priority="52"
631
+ Name="List Table 7 Colorful Accent 5"/>
632
+ <w:LsdException Locked="false" Priority="46"
633
+ Name="List Table 1 Light Accent 6"/>
634
+ <w:LsdException Locked="false" Priority="47" Name="List Table 2 Accent 6"/>
635
+ <w:LsdException Locked="false" Priority="48" Name="List Table 3 Accent 6"/>
636
+ <w:LsdException Locked="false" Priority="49" Name="List Table 4 Accent 6"/>
637
+ <w:LsdException Locked="false" Priority="50" Name="List Table 5 Dark Accent 6"/>
638
+ <w:LsdException Locked="false" Priority="51"
639
+ Name="List Table 6 Colorful Accent 6"/>
640
+ <w:LsdException Locked="false" Priority="52"
641
+ Name="List Table 7 Colorful Accent 6"/>
642
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
643
+ Name="Mention"/>
644
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
645
+ Name="Smart Hyperlink"/>
646
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
647
+ Name="Hashtag"/>
648
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
649
+ Name="Unresolved Mention"/>
650
+ <w:LsdException Locked="false" SemiHidden="true" UnhideWhenUsed="true"
651
+ Name="Smart Link"/>
652
+ </w:LatentStyles>
653
+ </xml><![endif]-->
654
+ <style>
655
+ <!--
656
+ /* Font Definitions */
657
+ @font-face
658
+ {font-family:"Cambria Math";
659
+ panose-1:2 4 5 3 5 4 6 3 2 4;
660
+ mso-font-charset:0;
661
+ mso-generic-font-family:roman;
662
+ mso-font-pitch:variable;
663
+ mso-font-signature:-536869121 1107305727 33554432 0 415 0;}
664
+ @font-face
665
+ {font-family:Calibri;
666
+ panose-1:2 15 5 2 2 2 4 3 2 4;
667
+ mso-font-charset:0;
668
+ mso-generic-font-family:swiss;
669
+ mso-font-pitch:variable;
670
+ mso-font-signature:-469750017 -1073732485 9 0 511 0;}
671
+ /* Style Definitions */
672
+ p.MsoNormal, li.MsoNormal, div.MsoNormal
673
+ {mso-style-unhide:no;
674
+ mso-style-qformat:yes;
675
+ mso-style-parent:"";
676
+ margin-top:0cm;
677
+ margin-right:0cm;
678
+ margin-bottom:8.0pt;
679
+ margin-left:0cm;
680
+ line-height:107%;
681
+ mso-pagination:widow-orphan;
682
+ font-size:11.0pt;
683
+ font-family:"Calibri",sans-serif;
684
+ mso-ascii-font-family:Calibri;
685
+ mso-ascii-theme-font:minor-latin;
686
+ mso-fareast-font-family:Calibri;
687
+ mso-fareast-theme-font:minor-latin;
688
+ mso-hansi-font-family:Calibri;
689
+ mso-hansi-theme-font:minor-latin;
690
+ mso-bidi-font-family:"Times New Roman";
691
+ mso-bidi-theme-font:minor-bidi;
692
+ mso-font-kerning:1.0pt;
693
+ mso-ligatures:standardcontextual;
694
+ mso-fareast-language:EN-US;}
695
+ .MsoChpDefault
696
+ {mso-style-type:export-only;
697
+ mso-default-props:yes;
698
+ font-family:"Calibri",sans-serif;
699
+ mso-ascii-font-family:Calibri;
700
+ mso-ascii-theme-font:minor-latin;
701
+ mso-fareast-font-family:Calibri;
702
+ mso-fareast-theme-font:minor-latin;
703
+ mso-hansi-font-family:Calibri;
704
+ mso-hansi-theme-font:minor-latin;
705
+ mso-bidi-font-family:"Times New Roman";
706
+ mso-bidi-theme-font:minor-bidi;
707
+ mso-fareast-language:EN-US;}
708
+ .MsoPapDefault
709
+ {mso-style-type:export-only;
710
+ margin-bottom:8.0pt;
711
+ line-height:107%;}
712
+ @page WordSection1
713
+ {size:595.3pt 841.9pt;
714
+ margin:72.0pt 72.0pt 72.0pt 72.0pt;
715
+ mso-header-margin:35.4pt;
716
+ mso-footer-margin:35.4pt;
717
+ mso-paper-source:0;}
718
+ div.WordSection1
719
+ {page:WordSection1;}
720
+ -->
721
+ </style>
722
+ <!--[if gte mso 10]>
723
+ <style>
724
+ /* Style Definitions */
725
+ table.MsoNormalTable
726
+ {mso-style-name:"Table Normal";
727
+ mso-tstyle-rowband-size:0;
728
+ mso-tstyle-colband-size:0;
729
+ mso-style-noshow:yes;
730
+ mso-style-priority:99;
731
+ mso-style-parent:"";
732
+ mso-padding-alt:0cm 5.4pt 0cm 5.4pt;
733
+ mso-para-margin-top:0cm;
734
+ mso-para-margin-right:0cm;
735
+ mso-para-margin-bottom:8.0pt;
736
+ mso-para-margin-left:0cm;
737
+ line-height:107%;
738
+ mso-pagination:widow-orphan;
739
+ font-size:11.0pt;
740
+ font-family:"Calibri",sans-serif;
741
+ mso-ascii-font-family:Calibri;
742
+ mso-ascii-theme-font:minor-latin;
743
+ mso-hansi-font-family:Calibri;
744
+ mso-hansi-theme-font:minor-latin;
745
+ mso-bidi-font-family:"Times New Roman";
746
+ mso-bidi-theme-font:minor-bidi;
747
+ mso-font-kerning:1.0pt;
748
+ mso-ligatures:standardcontextual;
749
+ mso-fareast-language:EN-US;}
750
+ </style>
751
+ <![endif]--><!--[if gte mso 9]><xml>
752
+ <o:shapedefaults v:ext="edit" spidmax="1026"/>
753
+ </xml><![endif]--><!--[if gte mso 9]><xml>
754
+ <o:shapelayout v:ext="edit">
755
+ <o:idmap v:ext="edit" data="1"/>
756
+ </o:shapelayout></xml><![endif]-->
757
+ </head>
758
+
759
+ <body lang=EN-GB style='tab-interval:36.0pt;word-wrap:break-word'>
760
+
761
+ <div class=WordSection1>
762
+
763
+ <p class=MsoNormal>Hello, World!</p>
764
+
765
+ </div>
766
+
767
+ </body>
768
+
769
+ </html>
test/sample.pdf ADDED
Binary file (30 kB).
 
test/sample.txt ADDED
@@ -0,0 +1 @@
1
+ Hello, World!
test/sample_files/colorschememapping.xml ADDED
@@ -0,0 +1,2 @@
1
+ <?xml version="1.0" encoding="UTF-8" standalone="yes"?>
2
+ <a:clrMap xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main" bg1="lt1" tx1="dk1" bg2="lt2" tx2="dk2" accent1="accent1" accent2="accent2" accent3="accent3" accent4="accent4" accent5="accent5" accent6="accent6" hlink="hlink" folHlink="folHlink"/>
test/sample_files/filelist.xml ADDED
@@ -0,0 +1,6 @@
1
+ <xml xmlns:o="urn:schemas-microsoft-com:office:office">
2
+ <o:MainFile HRef="../sample.html"/>
3
+ <o:File HRef="themedata.thmx"/>
4
+ <o:File HRef="colorschememapping.xml"/>
5
+ <o:File HRef="filelist.xml"/>
6
+ </xml>
test/sample_files/themedata.thmx ADDED
Binary file (3.34 kB).
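colorschememapping.xml, filelist.xml and themedata.thmx are the companion artifacts Word writes alongside an HTML export (filelist.xml above points back at ../sample.html); they hold theme and colour-scheme metadata rather than document text. If a loader ever walks the test folder instead of taking explicit paths, it would presumably skip the *_files companion folder, e.g. (hypothetical helper, not part of the repo):

# Hypothetical helper: list fixture files while ignoring Word's
# "*_files" companion folders.
from pathlib import Path

def list_fixtures(root: str = "test") -> list[str]:
    return [
        str(p)
        for p in Path(root).rglob("*")
        if p.is_file() and not any(part.endswith("_files") for part in p.parts)
    ]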
 
test/test_module.py ADDED
@@ -0,0 +1,45 @@
1
+ # ---
2
+ # jupyter:
3
+ # jupytext:
4
+ # formats: ipynb,py:light
5
+ # text_representation:
6
+ # extension: .py
7
+ # format_name: light
8
+ # format_version: '1.5'
9
+ # jupytext_version: 1.15.0
10
+ # kernelspec:
11
+ # display_name: Python 3 (ipykernel)
12
+ # language: python
13
+ # name: python3
14
+ # ---
15
+
16
+ # +
17
+ import pytest
18
+ import gradio as gr
19
+ from ..chatfuncs.ingest import *
20
+ from ..chatfuncs.chatfuncs import *
21
+
22
+ def test_read_docx():
23
+ content = read_docx('sample.docx')
24
+ assert content == "Hello, World!"
25
+
26
+
27
+ # +
28
+ def test_parse_file():
29
+ # Assuming these files exist and you know their content
30
+ files = ['sample.docx', 'sample.pdf', 'sample.txt', 'sample.html']
31
+ contents = parse_file(files)
32
+
33
+ assert contents['sample.docx'] == 'Hello, World!'
34
+ assert contents['sample.pdf'] == 'Hello, World!'
35
+ assert contents['sample.txt'] == 'Hello, World!'
36
+ assert contents['sample.html'] == 'Hello, World!'
37
+
38
+ def test_unsupported_file_type():
39
+ files = ['sample.unknown']
40
+ contents = parse_file(files)
41
+ assert contents['sample.unknown'].startswith('Unsupported file type:')
42
+
43
+ def test_input_validation():
44
+ with pytest.raises(ValueError, match="Expected a list of file paths."):
45
+ parse_file('single_file_path.txt')
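Taken together, these tests pin down the contract implied for parse_file(): it accepts a list of paths, returns a dict keyed by those paths, marks unknown extensions with a string starting "Unsupported file type:", and raises ValueError("Expected a list of file paths.") for a non-list argument. A hypothetical sketch consistent with that contract (the real implementation lives in chatfuncs/ingest.py and may differ) is:

# Sketch only: mirrors the behaviour the tests above assert, not the
# actual chatfuncs.ingest implementation.
from pathlib import Path
from typing import Dict, List

def parse_file(files: List[str]) -> Dict[str, str]:
    if not isinstance(files, list):
        raise ValueError("Expected a list of file paths.")

    # Readers for .pdf/.docx/.html would be registered here as well.
    handlers = {
        ".txt": lambda p: Path(p).read_text(encoding="utf-8").strip(),
    }

    contents: Dict[str, str] = {}
    for f in files:
        suffix = Path(f).suffix.lower()
        reader = handlers.get(suffix)
        contents[f] = reader(f) if reader else f"Unsupported file type: {suffix}"
    return contents

Note that the relative imports in the test module (from ..chatfuncs.ingest import *) assume the tests are collected as part of a package; running pytest from the repository root without an __init__.py in test/ (or an equivalent rootdir/pythonpath configuration) may raise an import error instead.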