seanpedrickcase committed
Commit: 2cb9977
Parent(s): 2a8aba8

Faster embedding with GPU, fast document split, writes to chromadb file correctly. No longer needs FAISS or langchain

Files changed:
- .gitignore (+2, -1)
- app.py (+127, -40)
- requirements.txt (+2, -0)
- search_funcs/ingest.py (+86, -137)
.gitignore CHANGED
@@ -13,4 +13,5 @@
 *.ipynb
 build/*
 dist/*
-__pycache__/*
+__pycache__/*
+db/*
app.py CHANGED
@@ -8,27 +8,43 @@ nltk.download('punkt')
 from search_funcs.fast_bm25 import BM25
 from search_funcs.clean_funcs import initial_clean, get_lemma_tokens#, stem_sentence
 from nltk import word_tokenize
+#from sentence_transformers import SentenceTransformer

 PandasDataFrame = TypeVar('pd.core.frame.DataFrame')

 import gradio as gr
 import pandas as pd
 import os
+import time
+from chromadb.config import Settings

-from itertools import compress
-
-#from langchain.embeddings import HuggingFaceEmbeddings
-#from langchain.vectorstores import FAISS
 from transformers import AutoModel

+# model = AutoModel.from_pretrained('./model_and_tokenizer/int8-model.onnx', use_embedding_runtime=True)
+# sentence_embeddings = model.generate(engine_input)['last_hidden_state:0']
+
+# print("Sentence embeddings:", sentence_embeddings)
+
 import search_funcs.ingest as ing
-import search_funcs.chatfuncs as chatf
+#import search_funcs.chatfuncs as chatf

 # Import Chroma and instantiate a client. The default Chroma client is ephemeral, meaning it will not save to disk.
 import chromadb
 #from typing_extensions import Protocol
 #from chromadb import Documents, EmbeddingFunction, Embeddings

+from torch import cuda, backends
+
+# Check for torch cuda
+print(cuda.is_available())
+print(backends.cudnn.enabled)
+if cuda.is_available():
+    torch_device = "cuda"
+    os.system("nvidia-smi")
+
+else:
+    torch_device = "cpu"
+
 # Remove Chroma database file if it exists, as it can cause issues
 chromadb_file = "chroma.sqlite3"

@@ -176,14 +192,14 @@ def bm25_search(free_text_query, in_no_search_results, original_data, text_colum
         join_df[in_join_column] = join_df[in_join_column].astype(str).str.replace("\.0$","", regex=True)
         results_df_out[search_df_join_column] = results_df_out[search_df_join_column].astype(str).str.replace("\.0$","", regex=True)

+        # Duplicates dropped so as not to expand out dataframe
+        join_df = join_df.drop_duplicates(in_join_column)
+
         results_df_out = results_df_out.merge(join_df,left_on=search_df_join_column, right_on=in_join_column, how="left").drop(in_join_column, axis=1)

-
     # Reorder results by score
     results_df_out = results_df_out.sort_values('search_score_abs', ascending=False)

-
-
     # Out file
     results_df_name = "search_result.csv"
     results_df_out.to_csv(results_df_name, index= None)
@@ -227,7 +243,7 @@ def put_columns_in_df(in_file, in_bm25_column):
     df = read_file(in_file.name)
     new_choices = list(df.columns)

-    print(new_choices)
+    #print(new_choices)

     concat_choices.extend(new_choices)

@@ -279,7 +295,7 @@ def load_embeddings(embeddings_name = "jinaai/jina-embeddings-v2-small-en"):
     # Import Chroma and instantiate a client. The default Chroma client is ephemeral, meaning it will not save to disk.

     #else:
-    embeddings_func = AutoModel.from_pretrained(embeddings_name, trust_remote_code=True)
+    embeddings_func = AutoModel.from_pretrained(embeddings_name, trust_remote_code=True, device_map="auto")

     global embeddings

@@ -288,10 +304,12 @@ def load_embeddings(embeddings_name = "jinaai/jina-embeddings-v2-small-en"):
     return embeddings

 # Load embeddings
-embeddings_name =
-
-embeddings_model =
-
+#embeddings_name =
+embeddings_model = AutoModel.from_pretrained("jinaai/jina-embeddings-v2-small-en", trust_remote_code=True, device_map="auto")
+#embeddings_model = SentenceTransformer("BAAI/bge-small-en-v1.5")
+#embeddings_model = SentenceTransformer("paraphrase-MiniLM-L3-v2")
+
+embeddings = embeddings_model#load_embeddings(embeddings_name)

 def docs_to_chroma_save(docs_out, embeddings = embeddings, progress=gr.Progress()):
     '''
@@ -300,35 +318,92 @@ def docs_to_chroma_save(docs_out, embeddings = embeddings, progress=gr.Progress(

     print(f"> Total split documents: {len(docs_out)}")

-
+    print(docs_out)

     page_contents = [doc.page_content for doc in docs_out]
     page_meta = [doc.metadata for doc in docs_out]
     ids_range = range(0,len(page_contents))
     ids = [str(element) for element in ids_range]

-
-
-
+    tic = time.perf_counter()
+    #embeddings_list = []
+    #for page in progress.tqdm(page_contents, desc = "Preparing search index", unit = "rows"):
+    #    embeddings_list.append(embeddings.encode(sentences=page, max_length=1024).tolist())
+
+    embeddings_list = embeddings.encode(sentences=page_contents, max_length=256).tolist() # For Jina embeddings
+    #embeddings_list = embeddings.encode(sentences=page_contents, normalize_embeddings=True).tolist() # For BGE embeddings
+    #embeddings_list = embeddings.encode(sentences=page_contents).tolist() # For minilm
+
+    toc = time.perf_counter()
+    time_out = f"The embedding took {toc - tic:0.1f} seconds"
+
+    # Jina tiny
+    # This takes about 300 seconds for 240,000 records = 800 / second, 1024 max length
+    # For 50k records:
+    # 61 seconds at 1024 max length
+    # 55 seconds at 512 max length
+    # 43 seconds at 256 max length
+    # 31 seconds at 128 max length
+
+    # BGE small
+    # 96 seconds for 50k records at 512 length
+
+    # all-MiniLM-L6-v2
+    # 42.5 seconds at (256?) max length
+
+    # paraphrase-MiniLM-L3-v2
+    # 22 seconds for 128 max length


-
+    print(time_out)
+
+    chroma_tic = time.perf_counter()
+
+    # Create a new Chroma collection to store the documents and metadata. We don't need to specify an embedding function, and the default will be used.
+    client = chromadb.PersistentClient(path="./db", settings=Settings(
+        anonymized_telemetry=False))

-    # Create a new Chroma collection to store the supporting evidence. We don't need to specify an embedding function, and the default will be used.
     try:
-
+        print("Deleting existing collection.")
+        #collection = client.get_collection(name="my_collection")
         client.delete_collection(name="my_collection")
+        print("Creating new collection.")
+        collection = client.create_collection(name="my_collection")
     except:
+        print("Creating new collection.")
         collection = client.create_collection(name="my_collection")
-
-
-
-
-
-
+
+    # Max batch size is about 40,000, so add that amount in a loop
+    def create_batch_ranges(in_list, batch_size=40000):
+        total_rows = len(in_list)
+        ranges = []
+
+        for start in range(0, total_rows, batch_size):
+            end = min(start + batch_size, total_rows)
+            ranges.append(range(start, end))
+
+        return ranges
+
+    batch_ranges = create_batch_ranges(embeddings_list)
+    print(batch_ranges)
+
+    for row_range in progress.tqdm(batch_ranges, desc = "Creating vector database", unit = "batches of 40,000 rows"):
+
+        collection.add(
+            documents = page_contents[row_range.start:row_range.stop], # slice to .stop (not row_range[-1]) so the final row of each batch is kept
+            embeddings = embeddings_list[row_range.start:row_range.stop],
+            metadatas = page_meta[row_range.start:row_range.stop],
+            ids = ids[row_range.start:row_range.stop])
+
+    print(collection.count())

     #chatf.vectorstore = vectorstore_func

+    chroma_toc = time.perf_counter()
+
+    chroma_time_out = f"Loading to Chroma db took {chroma_toc - chroma_tic:0.1f} seconds"
+    print(chroma_time_out)
+
     out_message = "Document processing complete"

     return out_message, collection
@@ -381,37 +456,45 @@ def chroma_retrieval(new_question_kworded:str, vectorstore, docs, orig_df_col:st
     #df_docs = df#.apply(lambda x: x.explode()).reset_index(drop=True)

     # Keep only documents with a certain score
+
+    print(df_docs)

     docs_scores = df_docs["distances"] #.astype(float)

     # Only keep sources that are sufficiently relevant (i.e. similarity search score below threshold below)
     score_more_limit = df_docs.loc[docs_scores < vec_score_cut_off, :]
-    docs_keep = create_docs_keep_from_df(score_more_limit) #list(compress(docs, score_more_limit))
+    #docs_keep = create_docs_keep_from_df(score_more_limit) #list(compress(docs, score_more_limit))

     #print(docs_keep)

-    if
-        return 'No result found!',
+    if score_more_limit.empty:
+        return 'No result found!', None

     # Only keep sources that are at least 100 characters long
     docs_len = score_more_limit["documents"].str.len() >= 100
-
-
+
+    print(docs_len)
+
+    length_more_limit = score_more_limit.loc[docs_len == True, :] #pd.Series(docs_len) >= 100
+    #docs_keep = create_docs_keep_from_df(length_more_limit) #list(compress(docs_keep, length_more_limit))

     #print(length_more_limit)

-    if
-        return 'No result found!',
+    if length_more_limit.empty:
+        return 'No result found!', None

     length_more_limit['ids'] = length_more_limit['ids'].astype(int)

     #length_more_limit.to_csv("length_more_limit.csv", index = None)

     # Explode the 'metadatas' dictionary into separate columns
-    df_metadata_expanded =
+    df_metadata_expanded = length_more_limit['metadatas'].apply(pd.Series)
+
+    print(length_more_limit)
+    print(df_metadata_expanded)

     # Concatenate the original DataFrame with the expanded metadata DataFrame
-    results_df_out = pd.concat([
+    results_df_out = pd.concat([length_more_limit.drop('metadatas', axis=1), df_metadata_expanded], axis=1)

     results_df_out = results_df_out.rename(columns={"documents":orig_df_col})

@@ -428,6 +511,10 @@ def chroma_retrieval(new_question_kworded:str, vectorstore, docs, orig_df_col:st
     # Import data
     join_df = read_file(join_filename)
     join_df[in_join_column] = join_df[in_join_column].astype(str).str.replace("\.0$","", regex=True)
+
+    # Duplicates dropped so as not to expand out dataframe
+    join_df = join_df.drop_duplicates(in_join_column)
+
     results_df_out[search_df_join_column] = results_df_out[search_df_join_column].astype(str).str.replace("\.0$","", regex=True)

     results_df_out = results_df_out.merge(join_df,left_on=search_df_join_column, right_on=in_join_column, how="left").drop(in_join_column, axis=1)
@@ -435,7 +522,7 @@ def chroma_retrieval(new_question_kworded:str, vectorstore, docs, orig_df_col:st

     results_df_name = "semantic_search_result.csv"
     results_df_out.to_csv(results_df_name, index= None)
-    results_first_text = results_df_out[orig_df_col][0]
+    results_first_text = results_df_out[orig_df_col].iloc[0]

     return results_first_text, results_df_name

@@ -452,7 +539,7 @@ with block:

     k_val = gr.State(9999)
     out_passages = gr.State(9999)
-    vec_score_cut_off = gr.State(
+    vec_score_cut_off = gr.State(70)
     vec_weight = gr.State(1)

     docs_keep_as_doc_state = gr.State()
@@ -512,7 +599,7 @@ depends on factors such as the type of documents or queries. Information taken f
         with gr.Accordion("Load in data", open = True):
             in_semantic_file = gr.File(label="Upload data file for semantic search")
             in_semantic_column = gr.Dropdown(label="Enter the name of the text column in the data file to search")
-            load_semantic_data_button = gr.Button(value="Load in
+            load_semantic_data_button = gr.Button(value="Load in data file", variant="secondary", scale=0)

             ingest_embed_out = gr.Textbox(label="File/web page preparation progress")
             semantic_query = gr.Textbox(label="Enter semantic search query here")
@@ -572,7 +659,7 @@ depends on factors such as the type of documents or queries. Information taken f
     # Load in a csv/excel file for semantic search
     in_semantic_file.upload(put_columns_in_df, inputs=[in_semantic_file, in_semantic_column], outputs=[in_semantic_column, in_clean_data, search_df_join_column])
     load_semantic_data_button.click(ing.parse_csv_or_excel, inputs=[in_semantic_file, in_semantic_column], outputs=[ingest_text, current_source_semantic]).\
-        then(ing.csv_excel_text_to_docs, inputs=[ingest_text, in_semantic_column], outputs=[ingest_docs]).\
+        then(ing.csv_excel_text_to_docs, inputs=[ingest_text, in_semantic_column], outputs=[ingest_docs, load_finished_message]).\
         then(docs_to_chroma_save, inputs=[ingest_docs], outputs=[ingest_embed_out, vectorstore_state])

     # Semantic search query
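Chroma caps how many records can be added in a single call (the commit assumes a maximum of roughly 40,000 rows), so `docs_to_chroma_save` inserts in batches. Below is a minimal, self-contained sketch of that batching pattern, assuming chromadb's `PersistentClient`/`get_or_create_collection` API; the toy documents and 2-d embeddings are illustrative only. Slicing with `range.start`/`range.stop` keeps every row, whereas slicing up to `row_range[-1]` (the last *index* in the range) would silently drop the final row of each batch.

```python
import chromadb

def create_batch_ranges(in_list, batch_size=40000):
    # Split [0, len(in_list)) into consecutive ranges of at most batch_size rows
    return [range(start, min(start + batch_size, len(in_list)))
            for start in range(0, len(in_list), batch_size)]

client = chromadb.PersistentClient(path="./db")
collection = client.get_or_create_collection(name="my_collection")

docs = [f"document {i}" for i in range(100)]   # toy documents
vecs = [[float(i), 0.0] for i in range(100)]   # toy 2-d embeddings
ids = [str(i) for i in range(100)]

for row_range in create_batch_ranges(docs, batch_size=40):
    collection.add(
        documents=docs[row_range.start:row_range.stop],
        embeddings=vecs[row_range.start:row_range.stop],
        ids=ids[row_range.start:row_range.stop])

print(collection.count())  # 100
```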
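For context, a hedged sketch of how the persisted collection could be read back in a later session, assuming the same `./db` path, `my_collection` name, and Jina embedding model used in `docs_to_chroma_save`; this mirrors what `chroma_retrieval` does before its dataframe filtering.

```python
import chromadb
from transformers import AutoModel

embeddings_model = AutoModel.from_pretrained("jinaai/jina-embeddings-v2-small-en",
                                             trust_remote_code=True)

client = chromadb.PersistentClient(path="./db")
collection = client.get_collection(name="my_collection")

# Embed the query with the same model used at ingest time
query_vecs = embeddings_model.encode(["my search query"]).tolist()
results = collection.query(query_embeddings=query_vecs, n_results=5)

# Chroma returns parallel lists per query: ids, documents, metadatas, distances
for doc, dist in zip(results["documents"][0], results["distances"][0]):
    print(f"{dist:.3f}  {doc[:80]}")
```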
requirements.txt CHANGED
@@ -6,4 +6,6 @@ transformers
 langchain
 chromadb
 torch
+accelerate
+sentence-transformers
 gradio==3.50.0
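`accelerate` joins requirements.txt because `device_map="auto"` in `AutoModel.from_pretrained` relies on it to place the model on GPU when one is available, falling back to CPU otherwise; `sentence-transformers` backs the commented-out BGE/MiniLM alternatives. A minimal sketch, assuming the same Jina model as in app.py:

```python
from transformers import AutoModel

# device_map="auto" needs the accelerate package installed
model = AutoModel.from_pretrained("jinaai/jina-embeddings-v2-small-en",
                                  trust_remote_code=True,
                                  device_map="auto")
print(model.device)  # e.g. cuda:0 on a GPU box, cpu otherwise
```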
search_funcs/ingest.py CHANGED
@@ -1,39 +1,40 @@
-#
-# jupyter:
-#   jupytext:
-#     formats: ipynb,py:light
-#     text_representation:
-#       extension: .py
-#       format_name: light
-#       format_version: '1.5'
-#     jupytext_version: 1.14.6
-#   kernelspec:
-#     display_name: Python 3 (ipykernel)
-#     language: python
-#     name: python3
-# ---
-
-# # Ingest website to FAISS
-
-# ## Install/ import stuff we need
+# Install/ import stuff we need

 import os
-
+import time
 import re
 import pandas as pd
-
+import gradio as gr
+from typing import Type, List, Literal

-
-
-
-
-
+from pydantic import BaseModel, Field
+
+# Creating an alias for pandas DataFrame using Type
+PandasDataFrame = Type[pd.DataFrame]
+
+# class Document(BaseModel):
+#     """Class for storing a piece of text and associated metadata. Implementation adapted from Langchain code: https://github.com/langchain-ai/langchain/blob/master/libs/core/langchain_core/documents/base.py"""
+
+#     page_content: str
+#     """String text."""
+#     metadata: dict = Field(default_factory=dict)
+#     """Arbitrary metadata about the page content (e.g., source, relationships to other
+#         documents, etc.).
+#     """
+#     type: Literal["Document"] = "Document"
+
+class Document(BaseModel):
+    """Class for storing a piece of text and associated metadata. Implementation adapted from Langchain code: https://github.com/langchain-ai/langchain/blob/master/libs/core/langchain_core/documents/base.py"""
+
+    page_content: str
+    """String text."""
+    metadata: dict = Field(default_factory=dict)
+    """Arbitrary metadata about the page content (e.g., source, relationships to other
+        documents, etc.).
+    """
+    type: Literal["Document"] = "Document"

-#from bs4 import BeautifulSoup
-#from docx import Document as Doc
-#from pypdf import PdfReader
-
-PandasDataFrame = TypeVar('pd.core.frame.DataFrame')
 # -

 split_strat = ["\n\n", "\n", ". ", "! ", "? "]
@@ -79,7 +80,8 @@ def parse_file(file_paths, text_column='text'):
         # '.html': parse_html,
         # '.htm': parse_html,  # Considering both .html and .htm for HTML files
         '.csv': lambda file_path: parse_csv_or_excel(file_path, text_column),
-        '.xlsx': lambda file_path: parse_csv_or_excel(file_path, text_column)
+        '.xlsx': lambda file_path: parse_csv_or_excel(file_path, text_column),
+        '.parquet': lambda file_path: parse_csv_or_excel(file_path, text_column)
     }

     parsed_contents = {}
@@ -145,35 +147,16 @@ def parse_csv_or_excel(file_path, text_column = "text"):
         if text_column not in df.columns: return pd.DataFrame(), ['Please choose a valid column name']
         df['source'] = file_name
         df['page_section'] = ""
+    elif file_extension == ".parquet":
+        df = pd.read_parquet(file_path.name)
+        if text_column not in df.columns: return pd.DataFrame(), ['Please choose a valid column name']
+        df['source'] = file_name
+        df['page_section'] = ""
     else:
         print(f"Unsupported file type: {file_extension}")
         return pd.DataFrame(), ['Please choose a valid file type']
-
-    # file_names.append(file_name)
-    # out_df = pd.concat([out_df, df])
-
-    #if text_column not in df.columns:
-    #    return f"Column '{text_column}' not found in {file_path}"
-    #text_out = " ".join(df[text_column].dropna().astype(str))
     return df, file_names

-def parse_excel(file_path, text_column):
-    """
-    Read text from an Excel file.
-
-    Parameters:
-        file_path (str): Path to the Excel file.
-        text_column (str): Name of the column in the Excel file that contains the text content.
-
-    Returns:
-        Pandas DataFrame: Dataframe output from file read
-    """
-    df = pd.read_excel(file_path, engine='openpyxl')
-    #if text_column not in df.columns:
-    #    return f"Column '{text_column}' not found in {file_path}"
-    #text_out = " ".join(df[text_column].dropna().astype(str))
-    return df
-
 def get_file_path_end(file_path):
     match = re.search(r'(.*[\/\\])?(.+)$', file_path)

@@ -232,6 +215,21 @@ def write_out_metadata_as_string(metadata_in):
     metadata_string = [f"{' '.join(f'{k}: {v}' for k, v in d.items() if k != 'page_section')}" for d in metadata_in] # ['metadata']
     return metadata_string

+def combine_metadata_columns(df, cols):
+
+    df['metadatas'] = "{"
+    df['blank_column'] = ""
+
+    for n, col in enumerate(cols):
+        df[col] = df[col].astype(str).str.replace('"',"'").str.cat(df['blank_column'].astype(str), sep="")
+
+        df['metadatas'] = df['metadatas'] + '"' + cols[n] + '": "' + df[col] + '", '
+
+    df['metadatas'] = (df['metadatas'] + "}").str.replace(", }", "}")
+
+    return df['metadatas']
+
 def csv_excel_text_to_docs(df, text_column='text', chunk_size=None) -> List[Document]:
     """Converts a DataFrame's content to a list of Documents with metadata."""

@@ -249,7 +247,7 @@ def csv_excel_text_to_docs(df, text_column='text', chunk_size=None) -> List[Docu
             if col != text_column:
                 metadata[col] = value

-        metadata_string = write_out_metadata_as_string(metadata)[0]
+        # metadata_string = write_out_metadata_as_string(metadata)[0]

     # If chunk_size is provided, split the text into chunks
     if chunk_size:
@@ -275,6 +273,39 @@ def csv_excel_text_to_docs(df, text_column='text', chunk_size=None) -> List[Docu

     return doc_sections

+import ast
+
+def csv_excel_text_to_docs(df, text_column='text', chunk_size=None, progress=gr.Progress()) -> List[Document]:
+    """Converts a DataFrame's content to a list of dictionaries in the 'Document' format, containing page_content and associated metadata."""
+
+    ingest_tic = time.perf_counter()
+
+    doc_sections = []
+    df[text_column] = df[text_column].astype(str).str.strip() # Ensure column is a string column
+
+    cols = [col for col in df.columns if col != text_column]
+
+    df["metadata"] = combine_metadata_columns(df, cols)
+
+    df = df.rename(columns={text_column:"page_content"})
+
+    #print(df[["page_content", "metadata"]].to_dict(orient='records'))
+
+    #doc_sections = df[["page_content", "metadata"]].to_dict(orient='records')
+    #doc_sections = [Document(**row) for row in df[["page_content", "metadata"]].to_dict(orient='records')]
+
+    # Create a list of Document objects
+    doc_sections = [Document(page_content=row['page_content'],
+                             metadata= ast.literal_eval(row["metadata"]))
+                    for index, row in progress.tqdm(df.iterrows(), desc = "Splitting up text", unit = "rows")]
+
+    ingest_toc = time.perf_counter()
+
+    ingest_time_out = f"Preparing documents took {ingest_toc - ingest_tic:0.1f} seconds"
+    print(ingest_time_out)
+
+    return doc_sections, "Finished splitting documents"
+
 # # Functions for working with documents after loading them back in

 def pull_out_data(series):
@@ -331,85 +362,3 @@ def docs_elements_from_csv_save(docs_path="documents.csv"):
     doc_sources = [d['source'] for d in docs_meta]

     return out_df, docs_content, docs_meta, doc_sources
-
-# ## Create embeddings and save faiss vector store to the path specified in `save_to`
-
-def load_embeddings(model_name = "BAAI/bge-base-en-v1.5"):
-
-    #if model_name == "hkunlp/instructor-large":
-    #    embeddings_func = HuggingFaceInstructEmbeddings(model_name=model_name,
-    #    embed_instruction="Represent the paragraph for retrieval: ",
-    #    query_instruction="Represent the question for retrieving supporting documents: "
-    #    )
-
-    #else:
-    embeddings_func = HuggingFaceEmbeddings(model_name=model_name)
-
-    global embeddings
-
-    embeddings = embeddings_func
-
-    return embeddings_func
-
-def embed_faiss_save_to_zip(docs_out, save_to="faiss_lambeth_census_embedding", model_name = "BAAI/bge-base-en-v1.5"):
-
-    load_embeddings(model_name=model_name)
-
-    #embeddings_fast = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
-
-    print(f"> Total split documents: {len(docs_out)}")
-
-    vectorstore = FAISS.from_documents(documents=docs_out, embedding=embeddings)
-
-
-    if Path(save_to).exists():
-        vectorstore.save_local(folder_path=save_to)
-
-    print("> DONE")
-    print(f"> Saved to: {save_to}")
-
-    ### Save as zip, then remove faiss/pkl files to allow for upload to huggingface
-
-    import shutil
-
-    shutil.make_archive(save_to, 'zip', save_to)
-
-    os.remove(save_to + "/index.faiss")
-    os.remove(save_to + "/index.pkl")
-
-    shutil.move(save_to + '.zip', save_to + "/" + save_to + '.zip')
-
-    return vectorstore
-
-def docs_to_chroma_save(embeddings, docs_out:PandasDataFrame, save_to:str):
-    print(f"> Total split documents: {len(docs_out)}")
-
-    vectordb = Chroma.from_documents(documents=docs_out,
-                                     embedding=embeddings,
-                                     persist_directory=save_to)
-
-    # persiste the db to disk
-    vectordb.persist()
-
-    print("> DONE")
-    print(f"> Saved to: {save_to}")
-
-    return vectordb
-
-def sim_search_local_saved_vec(query, k_val, save_to="faiss_lambeth_census_embedding"):
-
-    load_embeddings()
-
-    docsearch = FAISS.load_local(folder_path=save_to, embeddings=embeddings)
-
-
-    display(Markdown(question))
-
-    search = docsearch.similarity_search_with_score(query, k=k_val)
-
-    for item in search:
-        print(item[0].page_content)
-        print(f"Page: {item[0].metadata['source']}")
-        print(f"Date: {item[0].metadata['date']}")
-        print(f"Score: {item[1]}")
-        print("---")
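A small round-trip check of the metadata encoding above: `combine_metadata_columns` builds a dict-literal string per row, and the new `csv_excel_text_to_docs` parses it back with `ast.literal_eval`. This sketch assumes simple scalar metadata; values containing braces, newlines or backslashes could still break `literal_eval`, so building a real dict and `json.dumps`-ing it would be a sturdier encoding.

```python
import ast
import pandas as pd

df = pd.DataFrame({"text": ["some page text"],
                   "source": ['report "final".xlsx'],
                   "page_section": ["intro"]})

cols = [col for col in df.columns if col != "text"]

# Same approach as combine_metadata_columns: double quotes in values become single
df["metadatas"] = "{"
for col in cols:
    df[col] = df[col].astype(str).str.replace('"', "'")
    df["metadatas"] = df["metadatas"] + '"' + col + '": "' + df[col] + '", '
df["metadatas"] = (df["metadatas"] + "}").str.replace(", }", "}")

meta = ast.literal_eval(df["metadatas"].iloc[0])
print(meta)  # {'source': "report 'final'.xlsx", 'page_section': 'intro'}
```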
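Finally, a hedged usage sketch for the pydantic `Document` class defined above: all `docs_to_chroma_save` needs from each document is `.page_content` and `.metadata`, which is why the commit can drop langchain's `Document` wholesale. The sample values here are illustrative only.

```python
from typing import Literal
from pydantic import BaseModel, Field

class Document(BaseModel):
    """Minimal stand-in for langchain's Document: text plus arbitrary metadata."""
    page_content: str
    metadata: dict = Field(default_factory=dict)
    type: Literal["Document"] = "Document"

docs_out = [Document(page_content="First row of text",
                     metadata={"source": "data.csv", "page_section": ""})]

# The same unpacking docs_to_chroma_save performs before embedding
page_contents = [doc.page_content for doc in docs_out]
page_meta = [doc.metadata for doc in docs_out]
print(page_contents, page_meta)
```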