seanpedrickcase committed
Commit 2cb9977
Parent: 2a8aba8

Faster embedding with GPU, fast document split, writes to chromadb file correctly. No longer needs FAISS or langchain

Files changed (4)
  1. .gitignore +2 -1
  2. app.py +127 -40
  3. requirements.txt +2 -0
  4. search_funcs/ingest.py +86 -137
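
The commit message mentions faster embedding on GPU. As a rough, self-contained sketch (not code copied from this repo), the snippet below loads the same Jina embedding model named in app.py with device_map="auto" so it lands on a GPU when one is available (this is what the newly added accelerate dependency enables), then encodes a small, hypothetical batch of texts in a single call, mirroring the encode(sentences=..., max_length=...) usage introduced further down:

    # Minimal sketch of the GPU-accelerated embedding step; the example texts are hypothetical.
    from transformers import AutoModel

    # device_map="auto" places the model on a GPU when available (requires the accelerate package)
    embeddings_model = AutoModel.from_pretrained(
        "jinaai/jina-embeddings-v2-small-en",
        trust_remote_code=True,
        device_map="auto",
    )

    texts = ["First passage to index.", "Second passage to index."]  # hypothetical data

    # The Jina v2 models expose encode() via their remote code; a smaller max_length
    # truncates long passages more aggressively but speeds up embedding.
    vectors = embeddings_model.encode(sentences=texts, max_length=256).tolist()
    print(len(vectors), len(vectors[0]))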
.gitignore CHANGED
@@ -13,4 +13,5 @@
  *.ipynb
  build/*
  dist/*
- __pycache__/*
+ __pycache__/*
+ db/*
app.py CHANGED
@@ -8,27 +8,43 @@ nltk.download('punkt')
  from search_funcs.fast_bm25 import BM25
  from search_funcs.clean_funcs import initial_clean, get_lemma_tokens#, stem_sentence
  from nltk import word_tokenize
+ #from sentence_transformers import SentenceTransformer

  PandasDataFrame = TypeVar('pd.core.frame.DataFrame')

  import gradio as gr
  import pandas as pd
  import os
+ import time
+ from chromadb.config import Settings

- from itertools import compress
-
- #from langchain.embeddings import HuggingFaceEmbeddings
- #from langchain.vectorstores import FAISS
  from transformers import AutoModel

+ # model = AutoModel.from_pretrained('./model_and_tokenizer/int8-model.onnx', use_embedding_runtime=True)
+ # sentence_embeddings = model.generate(engine_input)['last_hidden_state:0']
+
+ # print("Sentence embeddings:", sentence_embeddings)
+
  import search_funcs.ingest as ing
- import search_funcs.chatfuncs as chatf
+ #import search_funcs.chatfuncs as chatf

  # Import Chroma and instantiate a client. The default Chroma client is ephemeral, meaning it will not save to disk.
  import chromadb
  #from typing_extensions import Protocol
  #from chromadb import Documents, EmbeddingFunction, Embeddings

+ from torch import cuda, backends
+
+ # Check for torch cuda
+ print(cuda.is_available())
+ print(backends.cudnn.enabled)
+ if cuda.is_available():
+     torch_device = "cuda"
+     os.system("nvidia-smi")
+
+ else:
+     torch_device = "cpu"
+
  # Remove Chroma database file. If it exists as it can cause issues
  chromadb_file = "chroma.sqlite3"

@@ -176,14 +192,14 @@ def bm25_search(free_text_query, in_no_search_results, original_data, text_colum
  join_df[in_join_column] = join_df[in_join_column].astype(str).str.replace("\.0$","", regex=True)
  results_df_out[search_df_join_column] = results_df_out[search_df_join_column].astype(str).str.replace("\.0$","", regex=True)

+ # Duplicates dropped so as not to expand out dataframe
+ join_df = join_df.drop_duplicates(in_join_column)
+
  results_df_out = results_df_out.merge(join_df,left_on=search_df_join_column, right_on=in_join_column, how="left").drop(in_join_column, axis=1)

-
  # Reorder results by score
  results_df_out = results_df_out.sort_values('search_score_abs', ascending=False)

-
-
  # Out file
  results_df_name = "search_result.csv"
  results_df_out.to_csv(results_df_name, index= None)

@@ -227,7 +243,7 @@ def put_columns_in_df(in_file, in_bm25_column):
  df = read_file(in_file.name)
  new_choices = list(df.columns)

- print(new_choices)
+ #print(new_choices)

  concat_choices.extend(new_choices)

@@ -279,7 +295,7 @@ def load_embeddings(embeddings_name = "jinaai/jina-embeddings-v2-small-en"):
  # Import Chroma and instantiate a client. The default Chroma client is ephemeral, meaning it will not save to disk.

  #else:
- embeddings_func = AutoModel.from_pretrained(embeddings_name, trust_remote_code=True)
+ embeddings_func = AutoModel.from_pretrained(embeddings_name, trust_remote_code=True, device_map="auto")

  global embeddings

@@ -288,10 +304,12 @@ def load_embeddings(embeddings_name = "jinaai/jina-embeddings-v2-small-en"):
  return embeddings

  # Load embeddings
- embeddings_name = "jinaai/jina-embeddings-v2-small-en"
- #embeddings_name = "BAAI/bge-base-en-v1.5"
- embeddings_model = AutoModel.from_pretrained(embeddings_name, trust_remote_code=True)
- embeddings = load_embeddings(embeddings_name)
+ #embeddings_name =
+ embeddings_model = AutoModel.from_pretrained("jinaai/jina-embeddings-v2-small-en", trust_remote_code=True, device_map="auto")
+ #embeddings_model = SentenceTransformer("BAAI/bge-small-en-v1.5")
+ #embeddings_model = SentenceTransformer("paraphrase-MiniLM-L3-v2")
+
+ embeddings = embeddings_model #load_embeddings(embeddings_name)

  def docs_to_chroma_save(docs_out, embeddings = embeddings, progress=gr.Progress()):
      '''

@@ -300,35 +318,92 @@ def docs_to_chroma_save(docs_out, embeddings = embeddings, progress=gr.Progress(

  print(f"> Total split documents: {len(docs_out)}")

- #print(docs_out)
+ print(docs_out)

  page_contents = [doc.page_content for doc in docs_out]
  page_meta = [doc.metadata for doc in docs_out]
  ids_range = range(0,len(page_contents))
  ids = [str(element) for element in ids_range]

- embeddings_list = []
- for page in progress.tqdm(page_contents, desc = "Preparing search index", unit = "rows"):
-     embeddings_list.append(embeddings.encode(sentences=page, max_length=1024).tolist())
-
- client = chromadb.PersistentClient(path=".")
-
- # Create a new Chroma collection to store the supporting evidence. We don't need to specify an embedding fuction, and the default will be used.
+ tic = time.perf_counter()
+ #embeddings_list = []
+ #for page in progress.tqdm(page_contents, desc = "Preparing search index", unit = "rows"):
+ #    embeddings_list.append(embeddings.encode(sentences=page, max_length=1024).tolist())
+
+ embeddings_list = embeddings.encode(sentences=page_contents, max_length=256).tolist() # For Jina embeddings
+ #embeddings_list = embeddings.encode(sentences=page_contents, normalize_embeddings=True).tolist() # For BGE embeddings
+ #embeddings_list = embeddings.encode(sentences=page_contents).tolist() # For minilm
+
+ toc = time.perf_counter()
+ time_out = f"The embedding took {toc - tic:0.1f} seconds"
+
+ # Jina tiny
+ # This takes about 300 seconds for 240,000 records = 800 / second, 1024 max length
+ # For 50k records:
+ # 61 seconds at 1024 max length
+ # 55 seconds at 512 max length
+ # 43 seconds at 256 max length
+ # 31 seconds at 128 max length
+
+ # BGE small
+ # 96 seconds for 50k records at 512 length
+
+ # all-MiniLM-L6-v2
+ # 42.5 seconds at (256?) max length
+
+ # paraphrase-MiniLM-L3-v2
+ # 22 seconds for 128 max length
+
+ print(time_out)
+
+ chroma_tic = time.perf_counter()
+
+ # Create a new Chroma collection to store the documents and metadata. We don't need to specify an embedding fuction, and the default will be used.
+ client = chromadb.PersistentClient(path="./db", settings=Settings(
+     anonymized_telemetry=False))

  try:
-     collection = client.get_collection(name="my_collection")
+     print("Deleting existing collection.")
+     #collection = client.get_collection(name="my_collection")
      client.delete_collection(name="my_collection")
+     print("Creating new collection.")
+     collection = client.create_collection(name="my_collection")
  except:
+     print("Creating new collection.")
      collection = client.create_collection(name="my_collection")
-
- collection.add(
-     documents = page_contents,
-     embeddings = embeddings_list,
-     metadatas = page_meta,
-     ids = ids)
+
+ # Match batch size is about 40,000, so add that amount in a loop
+ def create_batch_ranges(in_list, batch_size=40000):
+     total_rows = len(in_list)
+     ranges = []
+
+     for start in range(0, total_rows, batch_size):
+         end = min(start + batch_size, total_rows)
+         ranges.append(range(start, end))
+
+     return ranges
+
+ batch_ranges = create_batch_ranges(embeddings_list)
+ print(batch_ranges)
+
+ for row_range in progress.tqdm(batch_ranges, desc = "Creating vector database", unit = "batches of 40,000 rows"):
+
+     collection.add(
+         documents = page_contents[row_range[0]:row_range[-1]],
+         embeddings = embeddings_list[row_range[0]:row_range[-1]],
+         metadatas = page_meta[row_range[0]:row_range[-1]],
+         ids = ids[row_range[0]:row_range[-1]])
+
+ print(collection.count())

  #chatf.vectorstore = vectorstore_func

+ chroma_toc = time.perf_counter()
+
+ chroma_time_out = f"Loading to Chroma db took {chroma_toc - chroma_tic:0.1f} seconds"
+ print(chroma_time_out)
+
  out_message = "Document processing complete"

  return out_message, collection

@@ -381,37 +456,45 @@ def chroma_retrieval(new_question_kworded:str, vectorstore, docs, orig_df_col:st
  #df_docs = df#.apply(lambda x: x.explode()).reset_index(drop=True)

  # Keep only documents with a certain score
+
+ print(df_docs)

  docs_scores = df_docs["distances"] #.astype(float)

  # Only keep sources that are sufficiently relevant (i.e. similarity search score below threshold below)
  score_more_limit = df_docs.loc[docs_scores < vec_score_cut_off, :]
- docs_keep = create_docs_keep_from_df(score_more_limit) #list(compress(docs, score_more_limit))
+ #docs_keep = create_docs_keep_from_df(score_more_limit) #list(compress(docs, score_more_limit))

  #print(docs_keep)

- if not docs_keep:
-     return 'No result found!', ""
+ if score_more_limit.empty:
+     return 'No result found!', None

  # Only keep sources that are at least 100 characters long
  docs_len = score_more_limit["documents"].str.len() >= 100
- length_more_limit = score_more_limit.loc[docs_len, :] #pd.Series(docs_len) >= 100
- docs_keep = create_docs_keep_from_df(length_more_limit) #list(compress(docs_keep, length_more_limit))
+
+ print(docs_len)
+
+ length_more_limit = score_more_limit.loc[docs_len == True, :] #pd.Series(docs_len) >= 100
+ #docs_keep = create_docs_keep_from_df(length_more_limit) #list(compress(docs_keep, length_more_limit))

  #print(length_more_limit)

- if not docs_keep:
-     return 'No result found!', ""
+ if length_more_limit.empty:
+     return 'No result found!', None

  length_more_limit['ids'] = length_more_limit['ids'].astype(int)

  #length_more_limit.to_csv("length_more_limit.csv", index = None)

  # Explode the 'metadatas' dictionary into separate columns
- df_metadata_expanded = df_docs['metadatas'].apply(pd.Series)
+ df_metadata_expanded = length_more_limit['metadatas'].apply(pd.Series)
+
+ print(length_more_limit)
+ print(df_metadata_expanded)

  # Concatenate the original DataFrame with the expanded metadata DataFrame
- results_df_out = pd.concat([df_docs.drop('metadatas', axis=1), df_metadata_expanded], axis=1)
+ results_df_out = pd.concat([length_more_limit.drop('metadatas', axis=1), df_metadata_expanded], axis=1)

  results_df_out = results_df_out.rename(columns={"documents":orig_df_col})

@@ -428,6 +511,10 @@ def chroma_retrieval(new_question_kworded:str, vectorstore, docs, orig_df_col:st
  # Import data
  join_df = read_file(join_filename)
  join_df[in_join_column] = join_df[in_join_column].astype(str).str.replace("\.0$","", regex=True)
+
+ # Duplicates dropped so as not to expand out dataframe
+ join_df = join_df.drop_duplicates(in_join_column)
+
  results_df_out[search_df_join_column] = results_df_out[search_df_join_column].astype(str).str.replace("\.0$","", regex=True)

  results_df_out = results_df_out.merge(join_df,left_on=search_df_join_column, right_on=in_join_column, how="left").drop(in_join_column, axis=1)

@@ -435,7 +522,7 @@ def chroma_retrieval(new_question_kworded:str, vectorstore, docs, orig_df_col:st

  results_df_name = "semantic_search_result.csv"
  results_df_out.to_csv(results_df_name, index= None)
- results_first_text = results_df_out[orig_df_col][0]
+ results_first_text = results_df_out[orig_df_col].iloc[0]

  return results_first_text, results_df_name

@@ -452,7 +539,7 @@ with block:

  k_val = gr.State(9999)
  out_passages = gr.State(9999)
- vec_score_cut_off = gr.State(100)
+ vec_score_cut_off = gr.State(70)
  vec_weight = gr.State(1)

  docs_keep_as_doc_state = gr.State()

@@ -512,7 +599,7 @@ depends on factors such as the type of documents or queries. Information taken f
  with gr.Accordion("Load in data", open = True):
      in_semantic_file = gr.File(label="Upload data file for semantic search")
      in_semantic_column = gr.Dropdown(label="Enter the name of the text column in the data file to search")
-     load_semantic_data_button = gr.Button(value="Load in CSV/Excel file", variant="secondary", scale=0)
+     load_semantic_data_button = gr.Button(value="Load in data file", variant="secondary", scale=0)

      ingest_embed_out = gr.Textbox(label="File/web page preparation progress")
      semantic_query = gr.Textbox(label="Enter semantic search query here")

@@ -572,7 +659,7 @@ depends on factors such as the type of documents or queries. Information taken f
  # Load in a csv/excel file for semantic search
  in_semantic_file.upload(put_columns_in_df, inputs=[in_semantic_file, in_semantic_column], outputs=[in_semantic_column, in_clean_data, search_df_join_column])
  load_semantic_data_button.click(ing.parse_csv_or_excel, inputs=[in_semantic_file, in_semantic_column], outputs=[ingest_text, current_source_semantic]).\
-     then(ing.csv_excel_text_to_docs, inputs=[ingest_text, in_semantic_column], outputs=[ingest_docs]).\
+     then(ing.csv_excel_text_to_docs, inputs=[ingest_text, in_semantic_column], outputs=[ingest_docs, load_finished_message]).\
      then(docs_to_chroma_save, inputs=[ingest_docs], outputs=[ingest_embed_out, vectorstore_state])

  # Semantic search query
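
A condensed sketch of the new persistence path in docs_to_chroma_save() above: precomputed embeddings are written to an on-disk Chroma collection under ./db in batches, matching the diff's note that roughly 40,000 records is the practical limit per add() call. This is an illustrative rewrite under assumed variable names, not the app's exact function; it uses half-open slices so each batch keeps its final row.

    # Sketch: persist precomputed embeddings to an on-disk Chroma collection in batches.
    import chromadb
    from chromadb.config import Settings

    def save_to_chroma(page_contents, embeddings_list, page_meta, ids,
                       db_path="./db", batch_size=40000):
        # A persistent client writes the collection to disk under db_path
        client = chromadb.PersistentClient(path=db_path,
                                           settings=Settings(anonymized_telemetry=False))

        # Start from a fresh collection on each run, as the app does
        try:
            client.delete_collection(name="my_collection")
        except Exception:
            pass
        collection = client.create_collection(name="my_collection")

        # Add records in batches; half-open slices include the last row of each batch
        for start in range(0, len(page_contents), batch_size):
            end = min(start + batch_size, len(page_contents))
            collection.add(
                documents=page_contents[start:end],
                embeddings=embeddings_list[start:end],
                metadatas=page_meta[start:end],
                ids=ids[start:end],
            )

        print(collection.count())
        return collection

Because the client is persistent, the collection survives restarts under ./db, which is why db/* is now ignored in .gitignore.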
requirements.txt CHANGED
@@ -6,4 +6,6 @@ transformers
  langchain
  chromadb
  torch
+ accelerate
+ sentence-transformers
  gradio==3.50.0
search_funcs/ingest.py CHANGED
@@ -1,39 +1,40 @@
- # ---
- # jupyter:
- # jupytext:
- # formats: ipynb,py:light
- # text_representation:
- # extension: .py
- # format_name: light
- # format_version: '1.5'
- # jupytext_version: 1.14.6
- # kernelspec:
- # display_name: Python 3 (ipykernel)
- # language: python
- # name: python3
- # ---
-
- # # Ingest website to FAISS
-
- # ## Install/ import stuff we need
+ # Install/ import stuff we need

  import os
- from pathlib import Path
+ import time
  import re
  import pandas as pd
- from typing import TypeVar, List
+ import gradio as gr
+ from typing import Type, List, Literal

- #from langchain.embeddings import HuggingFaceEmbeddings # HuggingFaceInstructEmbeddings,
- from langchain.vectorstores.faiss import FAISS
- from langchain.vectorstores import Chroma
- from langchain.text_splitter import RecursiveCharacterTextSplitter
- from langchain.docstore.document import Document
+ from pydantic import BaseModel, Field
+
+ # Creating an alias for pandas DataFrame using Type
+ PandasDataFrame = Type[pd.DataFrame]
+
+ # class Document(BaseModel):
+ #     """Class for storing a piece of text and associated metadata. Implementation adapted from Langchain code: https://github.com/langchain-ai/langchain/blob/master/libs/core/langchain_core/documents/base.py"""
+
+ #     page_content: str
+ #     """String text."""
+ #     metadata: dict = Field(default_factory=dict)
+ #     """Arbitrary metadata about the page content (e.g., source, relationships to other
+ #     documents, etc.).
+ #     """
+ #     type: Literal["Document"] = "Document"
+
+ class Document(BaseModel):
+     """Class for storing a piece of text and associated metadata. Implementation adapted from Langchain code: https://github.com/langchain-ai/langchain/blob/master/libs/core/langchain_core/documents/base.py"""
+
+     page_content: str
+     """String text."""
+     metadata: dict = Field(default_factory=dict)
+     """Arbitrary metadata about the page content (e.g., source, relationships to other
+     documents, etc.).
+     """
+     type: Literal["Document"] = "Document"

- #from bs4 import BeautifulSoup
- #from docx import Document as Doc
- #from pypdf import PdfReader

- PandasDataFrame = TypeVar('pd.core.frame.DataFrame')
  # -

  split_strat = ["\n\n", "\n", ". ", "! ", "? "]

@@ -79,7 +80,8 @@ def parse_file(file_paths, text_column='text'):
  # '.html': parse_html,
  # '.htm': parse_html, # Considering both .html and .htm for HTML files
  '.csv': lambda file_path: parse_csv_or_excel(file_path, text_column),
- '.xlsx': lambda file_path: parse_csv_or_excel(file_path, text_column)
+ '.xlsx': lambda file_path: parse_csv_or_excel(file_path, text_column),
+ '.parquet': lambda file_path: parse_csv_or_excel(file_path, text_column)
  }

  parsed_contents = {}

@@ -145,35 +147,16 @@ def parse_csv_or_excel(file_path, text_column = "text"):
  if text_column not in df.columns: return pd.DataFrame(), ['Please choose a valid column name']
  df['source'] = file_name
  df['page_section'] = ""
+ elif file_extension == ".parquet":
+     df = pd.read_parquet(file_path.name)
+     if text_column not in df.columns: return pd.DataFrame(), ['Please choose a valid column name']
+     df['source'] = file_name
+     df['page_section'] = ""
  else:
  print(f"Unsupported file type: {file_extension}")
  return pd.DataFrame(), ['Please choose a valid file type']
-
- # file_names.append(file_name)
- # out_df = pd.concat([out_df, df])
-
- #if text_column not in df.columns:
- # return f"Column '{text_column}' not found in {file_path}"
- #text_out = " ".join(df[text_column].dropna().astype(str))
  return df, file_names

- def parse_excel(file_path, text_column):
-     """
-     Read text from an Excel file.
-
-     Parameters:
-         file_path (str): Path to the Excel file.
-         text_column (str): Name of the column in the Excel file that contains the text content.
-
-     Returns:
-         Pandas DataFrame: Dataframe output from file read
-     """
-     df = pd.read_excel(file_path, engine='openpyxl')
-     #if text_column not in df.columns:
-     # return f"Column '{text_column}' not found in {file_path}"
-     #text_out = " ".join(df[text_column].dropna().astype(str))
-     return df
-
  def get_file_path_end(file_path):
  match = re.search(r'(.*[\/\\])?(.+)$', file_path)

@@ -232,6 +215,21 @@ def write_out_metadata_as_string(metadata_in):
  metadata_string = [f"{' '.join(f'{k}: {v}' for k, v in d.items() if k != 'page_section')}" for d in metadata_in] # ['metadata']
  return metadata_string

+ def combine_metadata_columns(df, cols):
+
+     df['metadatas'] = "{"
+     df['blank_column'] = ""
+
+     for n, col in enumerate(cols):
+         df[col] = df[col].astype(str).str.replace('"',"'").str.cat(df['blank_column'].astype(str), sep="")
+
+         df['metadatas'] = df['metadatas'] + '"' + cols[n] + '": "' + df[col] + '", '
+
+     df['metadatas'] = (df['metadatas'] + "}").str.replace(", }", "}")
+
+     return df['metadatas']
+
  def csv_excel_text_to_docs(df, text_column='text', chunk_size=None) -> List[Document]:
      """Converts a DataFrame's content to a list of Documents with metadata."""

@@ -249,7 +247,7 @@ def csv_excel_text_to_docs(df, text_column='text', chunk_size=None) -> List[Docu
  if col != text_column:
      metadata[col] = value

- metadata_string = write_out_metadata_as_string(metadata)[0]
+ # metadata_string = write_out_metadata_as_string(metadata)[0]

  # If chunk_size is provided, split the text into chunks
  if chunk_size:

@@ -275,6 +273,39 @@ def csv_excel_text_to_docs(df, text_column='text', chunk_size=None) -> List[Docu

  return doc_sections

+ import ast
+
+ def csv_excel_text_to_docs(df, text_column='text', chunk_size=None, progress=gr.Progress()) -> List[Document]:
+     """Converts a DataFrame's content to a list of dictionaries in the 'Document' format, containing page_content and associated metadata."""
+
+     ingest_tic = time.perf_counter()
+
+     doc_sections = []
+     df[text_column] = df[text_column].astype(str).str.strip() # Ensure column is a string column
+
+     cols = [col for col in df.columns if col != text_column]
+
+     df["metadata"] = combine_metadata_columns(df, cols)
+
+     df = df.rename(columns={text_column:"page_content"})
+
+     #print(df[["page_content", "metadata"]].to_dict(orient='records'))
+
+     #doc_sections = df[["page_content", "metadata"]].to_dict(orient='records')
+     #doc_sections = [Document(**row) for row in df[["page_content", "metadata"]].to_dict(orient='records')]
+
+     # Create a list of Document objects
+     doc_sections = [Document(page_content=row['page_content'],
+                              metadata=ast.literal_eval(row["metadata"]))
+                     for index, row in progress.tqdm(df.iterrows(), desc = "Splitting up text", unit = "rows")]
+
+     ingest_toc = time.perf_counter()
+
+     ingest_time_out = f"Preparing documents took {ingest_toc - ingest_tic:0.1f} seconds"
+     print(ingest_time_out)
+
+     return doc_sections, "Finished splitting documents"
+
  # # Functions for working with documents after loading them back in

  def pull_out_data(series):

@@ -331,85 +362,3 @@ def docs_elements_from_csv_save(docs_path="documents.csv"):
  doc_sources = [d['source'] for d in docs_meta]

  return out_df, docs_content, docs_meta, doc_sources
-
- # ## Create embeddings and save faiss vector store to the path specified in `save_to`
-
- def load_embeddings(model_name = "BAAI/bge-base-en-v1.5"):
-
-     #if model_name == "hkunlp/instructor-large":
-     # embeddings_func = HuggingFaceInstructEmbeddings(model_name=model_name,
-     # embed_instruction="Represent the paragraph for retrieval: ",
-     # query_instruction="Represent the question for retrieving supporting documents: "
-     # )
-
-     #else:
-     embeddings_func = HuggingFaceEmbeddings(model_name=model_name)
-
-     global embeddings
-
-     embeddings = embeddings_func
-
-     return embeddings_func
-
- def embed_faiss_save_to_zip(docs_out, save_to="faiss_lambeth_census_embedding", model_name = "BAAI/bge-base-en-v1.5"):
-
-     load_embeddings(model_name=model_name)
-
-     #embeddings_fast = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
-
-     print(f"> Total split documents: {len(docs_out)}")
-
-     vectorstore = FAISS.from_documents(documents=docs_out, embedding=embeddings)
-
-     if Path(save_to).exists():
-         vectorstore.save_local(folder_path=save_to)
-
-     print("> DONE")
-     print(f"> Saved to: {save_to}")
-
-     ### Save as zip, then remove faiss/pkl files to allow for upload to huggingface
-
-     import shutil
-
-     shutil.make_archive(save_to, 'zip', save_to)
-
-     os.remove(save_to + "/index.faiss")
-     os.remove(save_to + "/index.pkl")
-
-     shutil.move(save_to + '.zip', save_to + "/" + save_to + '.zip')
-
-     return vectorstore
-
- def docs_to_chroma_save(embeddings, docs_out:PandasDataFrame, save_to:str):
-     print(f"> Total split documents: {len(docs_out)}")
-
-     vectordb = Chroma.from_documents(documents=docs_out,
-                                      embedding=embeddings,
-                                      persist_directory=save_to)
-
-     # persiste the db to disk
-     vectordb.persist()
-
-     print("> DONE")
-     print(f"> Saved to: {save_to}")
-
-     return vectordb
-
- def sim_search_local_saved_vec(query, k_val, save_to="faiss_lambeth_census_embedding"):
-
-     load_embeddings()
-
-     docsearch = FAISS.load_local(folder_path=save_to, embeddings=embeddings)
-
-     display(Markdown(question))
-
-     search = docsearch.similarity_search_with_score(query, k=k_val)
-
-     for item in search:
-         print(item[0].page_content)
-         print(f"Page: {item[0].metadata['source']}")
-         print(f"Date: {item[0].metadata['date']}")
-         print(f"Score: {item[1]}")
-         print("---")
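
The rewritten csv_excel_text_to_docs() above builds pydantic Document objects straight from DataFrame rows instead of relying on langchain's Document class. The sketch below is a simplified variant for illustration only: it builds each row's metadata dict directly rather than serialising it to a string with combine_metadata_columns() and parsing it back with ast.literal_eval(), and the example DataFrame is hypothetical.

    # Simplified sketch of the new ingest path: one Document per DataFrame row.
    from typing import Literal

    import pandas as pd
    from pydantic import BaseModel, Field

    class Document(BaseModel):
        """A piece of text plus arbitrary metadata (mirrors the class defined in ingest.py)."""
        page_content: str
        metadata: dict = Field(default_factory=dict)
        type: Literal["Document"] = "Document"

    def df_to_docs(df: pd.DataFrame, text_column: str = "text") -> list:
        # Every column other than the text column is carried along as metadata
        df[text_column] = df[text_column].astype(str).str.strip()
        meta_cols = [col for col in df.columns if col != text_column]
        return [
            Document(page_content=row[text_column],
                     metadata={col: row[col] for col in meta_cols})
            for _, row in df.iterrows()
        ]

    # Hypothetical usage
    example = pd.DataFrame({"text": ["first passage", "second passage"],
                            "source": ["a.csv", "a.csv"]})
    docs = df_to_docs(example)
    print(docs[0].metadata)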