seanpedrickcase committed on
Commit
ea0dd40
1 Parent(s): 2806807

Changed the embedding model to MiniLM-L6, as it is faster. Compressed embeddings are now int8. General improvements to API mode.

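For context on the headline change: the int8 compression adopted in search_funcs/semantic_functions.py below follows the sentence-transformers embedding-quantization pattern. A minimal sketch, mirroring the calls used in this commit (the sentences are toy examples, not app data):

from sentence_transformers import SentenceTransformer, quantize_embeddings

model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Corpus embeddings encoded straight to int8: one byte per dimension
# instead of four, so the saved .npz files are roughly a quarter the size.
corpus_embeddings = model.encode(["a happy day", "stormy weather"], precision="int8")

# The query is encoded in float32, then quantized against the stored corpus
# embeddings, as bge_semantic_search does below.
query_embedding = quantize_embeddings(
    model.encode("happiness"),
    precision="int8",
    calibration_embeddings=corpus_embeddings,
)
print(corpus_embeddings.dtype, query_embedding.dtype)  # int8 int8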
Dockerfile CHANGED
@@ -1,11 +1,8 @@
  # First stage: build dependencies
- #FROM public.ecr.aws/docker/library/python:3.11.9-slim-bookworm

- # Trying Python 3.10, as I saw somewhere that Python 3.11 may result in corrupted openpyxl xlsx outputs
- FROM public.ecr.aws/docker/library/python:3.10.14-slim-bookworm
-
- # Install Lambda web adapter in case you want to run with an AWS Lambda function URL
- COPY --from=public.ecr.aws/awsguru/aws-lambda-adapter:0.8.3 /lambda-adapter /opt/extensions/lambda-adapter

  # Install wget
  RUN apt-get update && apt-get install -y wget
@@ -20,14 +17,12 @@ COPY requirements.txt .
  RUN pip install --no-cache-dir -r requirements.txt

  # Gradio needs to be installed after due to a conflict with spacy in requirements
- RUN pip install --no-cache-dir gradio==4.36.1

- # Download the BGE embedding model during the build process
- RUN curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | bash
- RUN apt-get install git-lfs -y
- RUN git lfs install
- RUN git clone https://huggingface.co/BAAI/bge-small-en-v1.5 /model/bge
- RUN rm -rf /model/bge/.git

  # Set up a new user named "user" with user ID 1000
  RUN useradd -m -u 1000 user
@@ -47,6 +42,7 @@ ENV HOME=/home/user \
  PATH=/home/user/.local/bin:$PATH \
  PYTHONPATH=$HOME/app \
  PYTHONUNBUFFERED=1 \
  GRADIO_ALLOW_FLAGGING=never \
  GRADIO_NUM_PORTS=1 \
  GRADIO_SERVER_NAME=0.0.0.0 \
 
  # First stage: build dependencies
+ FROM public.ecr.aws/docker/library/python:3.11.9-slim-bookworm

+ # Optional - install the Lambda web adapter in case you want to run with an AWS Lambda function URL
+ # COPY --from=public.ecr.aws/awsguru/aws-lambda-adapter:0.8.3 /lambda-adapter /opt/extensions/lambda-adapter

  # Install wget
  RUN apt-get update && apt-get install -y wget

  RUN pip install --no-cache-dir -r requirements.txt

  # Gradio needs to be installed after due to a conflict with spacy in requirements
+ RUN pip install --no-cache-dir gradio==4.37.2

+ # Download the MiniLM embedding model during the build process. Create a directory for the model and download specific files using huggingface_hub
+ RUN mkdir -p /model/minilm
+ COPY download_model.py /src/download_model.py
+ RUN python /src/download_model.py

  # Set up a new user named "user" with user ID 1000
  RUN useradd -m -u 1000 user

  PATH=/home/user/.local/bin:$PATH \
  PYTHONPATH=$HOME/app \
  PYTHONUNBUFFERED=1 \
+ PYTHONDONTWRITEBYTECODE=1 \
  GRADIO_ALLOW_FLAGGING=never \
  GRADIO_NUM_PORTS=1 \
  GRADIO_SERVER_NAME=0.0.0.0 \
app.py CHANGED
@@ -7,7 +7,7 @@ PandasDataFrame = Type[pd.DataFrame]

  from search_funcs.bm25_functions import prepare_bm25_input_data, prepare_bm25, bm25_search
  from search_funcs.semantic_ingest_functions import csv_excel_text_to_docs
- from search_funcs.semantic_functions import docs_to_bge_embed_np_array, bge_semantic_search
  from search_funcs.helper_functions import display_info, initial_data_load, put_columns_in_join_df, get_temp_folder_path, empty_folder, get_connection_params, output_folder
  from search_funcs.spacy_search_funcs import spacy_fuzzy_search
  from search_funcs.aws_functions import load_data_from_aws
@@ -24,24 +24,29 @@ with app:

  # BM25 state objects
  orig_keyword_data_state = gr.State(pd.DataFrame()) # Original data that is not changed #gr.Dataframe(pd.DataFrame(),visible=False) #gr.State(pd.DataFrame())
  prepared_keyword_data_state = gr.State(pd.DataFrame()) # Data frame that contains modified data #gr.Dataframe(pd.DataFrame(),visible=False) #gr.State(pd.DataFrame())
  #tokenised_prepared_keyword_data_state = gr.State([]) # This is data that has been loaded in as tokens #gr.Dataframe(pd.DataFrame(),visible=False) #gr.State()
  tokenised_prepared_keyword_data_state = gr.State([]) # Data that has been prepared for search (tokenised) #gr.Dataframe(np.array([]), type="array", visible=False) #gr.State([])
- bm25_search_index_state = gr.State()
-

  # Semantic search state objects
  orig_semantic_data_state = gr.State(pd.DataFrame()) #gr.Dataframe(pd.DataFrame(),visible=False) # gr.State(pd.DataFrame())
  semantic_data_state = gr.State(pd.DataFrame()) #gr.Dataframe(pd.DataFrame(),visible=False) # gr.State(pd.DataFrame())
  semantic_input_document_format = gr.State([])
  embeddings_state = gr.State(np.array([])) #gr.Dataframe(np.array([]), type="numpy", visible=False) #gr.State(np.array([])) # globals()["embeddings"]
  semantic_k_val = gr.Number(9999, visible=False)

  # State objects for app in general
  session_hash_state = gr.State("")
  s3_output_folder_state = gr.State("")
  join_data_state = gr.State(pd.DataFrame()) #gr.Dataframe(pd.DataFrame(), visible=False) #gr.State(pd.DataFrame())
- output_file_state = gr.Dropdown([], visible=False, allow_custom_value=True) #gr.Dataframe(type="array", visible=False) #gr.State([])

  # Informational state objects
  in_k1_info = gr.State("""k1: Constant used for influencing the term frequency saturation. After saturation is reached, additional
@@ -95,7 +100,7 @@ depends on factors such as the type of documents or queries. Information taken f
  """
  **Thematic/semantic search**

- This search type enables you to search for broader themes (e.g. happiness, nature) and the search will pick out text passages that relate to these themes even if they don't contain the exact words. 1. Load in data file (ideally a file with '_cleaned' at the end of the name, a pkl.gz file), with (optionally) the 'embeddings... .npz' file in the same folder to save loading time. 2. Select the field in your data to search. 3. Wait for the data file to be prepared for search. 4. Enter the search term in the 'Enter semantic search query here' box below and press Enter/click on 'Start semantic search'. 5. Your search results will be saved in a csv file and will be presented in the 'File output' area below.
  """)

  with gr.Row():
@@ -122,7 +127,7 @@ depends on factors such as the type of documents or queries. Information taken f
  with gr.Row():
  in_clean_data = gr.Dropdown(label = "Clean text during load (remove html tags). For large files this may take some time!", value="No", choices=["Yes", "No"])
  return_intermediate_files = gr.Dropdown(label = "Return intermediate processing files from file preparation. Files can be loaded in to save processing time in future.", value="No", choices=["Yes", "No"])
- embedding_super_compress = gr.Dropdown(label = "Round embeddings to three dp for smaller files with less accuracy.", value="Yes", choices=["Yes", "No"])
  #save_clean_data_button = gr.Button(value = "Save loaded data to file", scale = 1)
  with gr.Accordion(label="Keyword search options", open = False):
  with gr.Row():
@@ -194,12 +199,12 @@ depends on factors such as the type of documents or queries. Information taken f
  # Load in a csv/excel file for semantic search
  in_semantic_file.change(initial_data_load, inputs=[in_semantic_file], outputs=[in_semantic_column, search_df_join_column, semantic_data_state, orig_semantic_data_state, bm25_search_index_state, embeddings_state, tokenised_prepared_keyword_data_state, semantic_load_progress, current_source_semantic])
  load_semantic_data_button.click(
- csv_excel_text_to_docs, inputs=[semantic_data_state, in_semantic_file, in_semantic_column, in_clean_data, return_intermediate_files], outputs=[semantic_input_document_format, semantic_load_progress, output_file_state]).\
- then(docs_to_bge_embed_np_array, inputs=[semantic_input_document_format, in_semantic_file, embeddings_state, output_file_state, in_clean_data, return_intermediate_files, embedding_super_compress], outputs=[semantic_load_progress, embeddings_state, semantic_output_file, output_file_state]) # vectorstore_state
-
  # Semantic search query
- semantic_submit.click(bge_semantic_search, inputs=[semantic_query, embeddings_state, semantic_input_document_format, semantic_k_val, semantic_min_distance, join_data_state, in_join_column, search_df_join_column], outputs=[semantic_output_single_text, semantic_output_file], api_name="semantic_search")
- semantic_query.submit(bge_semantic_search, inputs=[semantic_query, embeddings_state, semantic_input_document_format, semantic_k_val, semantic_min_distance, join_data_state, in_join_column, search_df_join_column], outputs=[semantic_output_single_text, semantic_output_file])

  app.load(get_connection_params, inputs=None, outputs=[session_hash_state, s3_output_folder_state])

  from search_funcs.bm25_functions import prepare_bm25_input_data, prepare_bm25, bm25_search
  from search_funcs.semantic_ingest_functions import csv_excel_text_to_docs
+ from search_funcs.semantic_functions import load_embedding_model, docs_to_bge_embed_np_array, bge_semantic_search
  from search_funcs.helper_functions import display_info, initial_data_load, put_columns_in_join_df, get_temp_folder_path, empty_folder, get_connection_params, output_folder
  from search_funcs.spacy_search_funcs import spacy_fuzzy_search
  from search_funcs.aws_functions import load_data_from_aws

  # BM25 state objects
  orig_keyword_data_state = gr.State(pd.DataFrame()) # Original data that is not changed #gr.Dataframe(pd.DataFrame(),visible=False) #gr.State(pd.DataFrame())
+ #orig_keyword_data_state = gr.State(pd.DataFrame()) # Original data that is not changed #gr.Dataframe(pd.DataFrame(),visible=False) #gr.State(pd.DataFrame())
  prepared_keyword_data_state = gr.State(pd.DataFrame()) # Data frame that contains modified data #gr.Dataframe(pd.DataFrame(),visible=False) #gr.State(pd.DataFrame())
  #tokenised_prepared_keyword_data_state = gr.State([]) # This is data that has been loaded in as tokens #gr.Dataframe(pd.DataFrame(),visible=False) #gr.State()
  tokenised_prepared_keyword_data_state = gr.State([]) # Data that has been prepared for search (tokenised) #gr.Dataframe(np.array([]), type="array", visible=False) #gr.State([])
+ bm25_search_index_state = gr.State()

  # Semantic search state objects
  orig_semantic_data_state = gr.State(pd.DataFrame()) #gr.Dataframe(pd.DataFrame(),visible=False) # gr.State(pd.DataFrame())
  semantic_data_state = gr.State(pd.DataFrame()) #gr.Dataframe(pd.DataFrame(),visible=False) # gr.State(pd.DataFrame())
  semantic_input_document_format = gr.State([])
+
+ embeddings_model_name_state = gr.State("sentence-transformers/all-MiniLM-L6-v2")#"BAAI/bge-small-en-v1.5")
+ embeddings_model_loc_state = gr.State("minilm/")#"bge/")
  embeddings_state = gr.State(np.array([])) #gr.Dataframe(np.array([]), type="numpy", visible=False) #gr.State(np.array([])) # globals()["embeddings"]
+ embeddings_model_state = gr.State()
+ torch_device_state = gr.State("cpu")
  semantic_k_val = gr.Number(9999, visible=False)

  # State objects for app in general
  session_hash_state = gr.State("")
  s3_output_folder_state = gr.State("")
  join_data_state = gr.State(pd.DataFrame()) #gr.Dataframe(pd.DataFrame(), visible=False) #gr.State(pd.DataFrame())
+ output_file_state = gr.State([]) #gr.Dataframe(type="array", visible=False) #gr.State([])

  # Informational state objects
  in_k1_info = gr.State("""k1: Constant used for influencing the term frequency saturation. After saturation is reached, additional

  """
  **Thematic/semantic search**

+ This search type enables you to search for general terms (e.g. happiness, nature) and the search will pick out text passages that are most semantically similar to them. 1. Load in data file (ideally a file with '_cleaned' at the end of the name, a pkl.gz file), with (optionally) the 'embeddings... .npz' file in the same folder to save loading time. 2. Select the field in your data to search. 3. Wait for the data file to be prepared for search. 4. Enter the search term in the 'Enter semantic search query here' box below and press Enter/click on 'Start semantic search'. 5. Your search results will be saved in a csv file and will be presented in the 'File output' area below.
  """)

  with gr.Row():

  with gr.Row():
  in_clean_data = gr.Dropdown(label = "Clean text during load (remove html tags). For large files this may take some time!", value="No", choices=["Yes", "No"])
  return_intermediate_files = gr.Dropdown(label = "Return intermediate processing files from file preparation. Files can be loaded in to save processing time in future.", value="No", choices=["Yes", "No"])
+ embeddings_compress = gr.Dropdown(label = "Quantise embeddings to int8 precision for smaller files with less accuracy.", value="Yes", choices=["Yes", "No"])
  #save_clean_data_button = gr.Button(value = "Save loaded data to file", scale = 1)
  with gr.Accordion(label="Keyword search options", open = False):
  with gr.Row():

  # Load in a csv/excel file for semantic search
  in_semantic_file.change(initial_data_load, inputs=[in_semantic_file], outputs=[in_semantic_column, search_df_join_column, semantic_data_state, orig_semantic_data_state, bm25_search_index_state, embeddings_state, tokenised_prepared_keyword_data_state, semantic_load_progress, current_source_semantic])
  load_semantic_data_button.click(
+ csv_excel_text_to_docs, inputs=[semantic_data_state, in_semantic_file, in_semantic_column, in_clean_data, return_intermediate_files], outputs=[semantic_input_document_format, semantic_load_progress, output_file_state], api_name="convert_texts_to_documents").\
+ then(docs_to_bge_embed_np_array, inputs=[semantic_input_document_format, in_semantic_file, output_file_state, in_clean_data, embeddings_state, embeddings_model_name_state, embeddings_model_loc_state, return_intermediate_files, embeddings_compress], outputs=[semantic_load_progress, embeddings_state, semantic_output_file, output_file_state, embeddings_model_state], api_name="embed_documents")
+
  # Semantic search query
+ semantic_submit.click(bge_semantic_search, inputs=[semantic_query, embeddings_state, semantic_input_document_format, semantic_k_val, semantic_min_distance, embeddings_model_state, embeddings_model_name_state, embeddings_compress, join_data_state, in_join_column, search_df_join_column], outputs=[semantic_output_single_text, semantic_output_file], api_name="semantic_search")
+ semantic_query.submit(bge_semantic_search, inputs=[semantic_query, embeddings_state, semantic_input_document_format, semantic_k_val, semantic_min_distance, embeddings_model_state, embeddings_model_name_state, embeddings_compress, join_data_state, in_join_column, search_df_join_column], outputs=[semantic_output_single_text, semantic_output_file])

  app.load(get_connection_params, inputs=None, outputs=[session_hash_state, s3_output_folder_state])
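The api_name arguments above are part of the API-mode improvements: each named event becomes a callable endpoint. A hedged sketch of inspecting them with gradio_client (the URL is a placeholder for wherever the app is hosted):

from gradio_client import Client

# Connect to a running instance of the app (placeholder URL)
client = Client("http://localhost:7860/")

# Lists the endpoints registered above - /convert_texts_to_documents,
# /embed_documents and /semantic_search - with their expected inputs and outputs.
print(client.view_api())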
download_model.py ADDED
@@ -0,0 +1,15 @@
+ from huggingface_hub import hf_hub_download
+
+ # Define the repository and files to download
+ repo_id = "sentence-transformers/all-MiniLM-L6-v2" #"BAAI/bge-small-en-v1.5"
+ files_to_download = [
+     "config.json",
+     "pytorch_model.bin",
+     "tokenizer_config.json",
+     "vocab.txt"
+ ]
+
+ # Download each file and save it to the /model/minilm directory
+ for file_name in files_to_download:
+     print("Checking for file", file_name)
+     hf_hub_download(repo_id=repo_id, filename=file_name, local_dir="/model/minilm") #"/model/bge"
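At runtime, load_embedding_model in search_funcs/semantic_functions.py points SentenceTransformer at this directory. A minimal sketch of that load, assuming the four files above are sufficient for your sentence-transformers version:

from sentence_transformers import SentenceTransformer

# If the directory is not a full sentence-transformers checkpoint, the library
# wraps the bare transformer with a default mean-pooling module and warns.
model = SentenceTransformer("/model/minilm")
print(model.encode(["a short test sentence"]).shape)  # (1, 384) for MiniLM-L6-v2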
output/36de65711121889ccdcb768b85e97e386d8fe4bd/keyword_search_result_20240702_school.xlsm ADDED
Binary file (9.92 kB).
 
requirements.txt CHANGED
@@ -2,7 +2,7 @@ pandas==2.2.2
  polars==0.20.3
  pyarrow==14.0.2
  openpyxl==3.1.3
- torch==2.3.1
  spacy
  en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1.tar.gz
  gradio

  polars==0.20.3
  pyarrow==14.0.2
  openpyxl==3.1.3
+ torch==2.3.1 --index-url https://download.pytorch.org/whl/cpu
  spacy
  en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1.tar.gz
  gradio
requirements_gpu.txt CHANGED
@@ -2,7 +2,7 @@ pandas==2.2.2
  polars==0.20.3
  pyarrow==14.0.2
  openpyxl==3.1.3
- torch==2.3.1 --index-url https://download.pytorch.org/whl/cu121
  spacy
  en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1.tar.gz
  gradio

  polars==0.20.3
  pyarrow==14.0.2
  openpyxl==3.1.3
+ torch==2.4.0 --index-url https://download.pytorch.org/whl/nightly/cu121
  spacy
  en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1.tar.gz
  gradio
search_funcs/bm25_functions.py CHANGED
@@ -15,28 +15,7 @@ from datetime import datetime
  today_rev = datetime.now().strftime("%Y%m%d")

  from search_funcs.clean_funcs import initial_clean # get_lemma_tokens, stem_sentence
- from search_funcs.helper_functions import get_file_path_end_with_ext, get_file_path_end, create_highlighted_excel_wb, ensure_output_folder_exists, output_folder
-
- # Load the SpaCy model
- from spacy.cli.download import download
- import spacy
- spacy.prefer_gpu()
-
- #os.system("python -m spacy download en_core_web_sm")
- try:
-     import en_core_web_sm
-     nlp = en_core_web_sm.load()
-     print("Successfully imported spaCy model")
-     #nlp = spacy.load("en_core_web_sm")
-     #print(nlp._path)
- except:
-     download("en_core_web_sm")
-     nlp = spacy.load("en_core_web_sm")
-     print("Successfully imported spaCy model")
-     #print(nlp._path)
-
- # including punctuation rules and exceptions
- tokenizer = nlp.tokenizer

  PARAM_K1 = 1.5
  PARAM_B = 0.75
@@ -230,6 +209,35 @@ class BM25:
      with open(f"{output_folder}{filename}.pkl", "rb") as fsave:
          return pickle.load(fsave)

  def prepare_bm25_input_data(
      in_file: list,
      text_column: str,
@@ -348,9 +356,8 @@ def prepare_bm25_input_data(
      else:
          tokeniser_tic = time.perf_counter()
          prepared_search_text_list = []
-         batch_size = 256
-         for doc in tokenizer.pipe(progress.tqdm(prepared_text_as_list, desc = "Tokenising text", unit = "rows"), batch_size=batch_size):
-             prepared_search_text_list.append([token.text for token in doc])

          tokeniser_toc = time.perf_counter()
          tokenizer_time_out = f"Tokenising the text took {tokeniser_toc - tokeniser_tic:0.1f} seconds."
@@ -519,26 +526,18 @@ def prepare_bm25(

      return message, None, bm25, prepared_search_text_list

- def convert_bm25_query_to_tokens(free_text_query, clean="No"):
-     '''
-     Split open text query into tokens and then lemmatise to get the core of the word. Currently 'clean' has no effect.
-     '''
-
-     if clean=="Yes":
-         split_query = tokenizer(free_text_query.lower())
-         out_query = [token.text for token in split_query]
-         #out_query = stem_sentence(out_query)
-     else:
-         split_query = tokenizer(free_text_query.lower())
-         out_query = [token.text for token in split_query]
-
-     print("Search query out is:", out_query)
-
-     if isinstance(out_query,str):
-         print("Converting string")
-         out_query = [out_query]
-
-     return out_query

  def bm25_search(
      free_text_query: str,
@@ -596,9 +595,11 @@ def bm25_search(
      Returns
      -------
      tuple
-         A tuple containing a message, the search results file name (if any), the BM25 object, and the prepared search text list.
      """

      progress(0, desc = "Conducting keyword search")

      print("in_join_file at start of bm25_search:", in_join_file)
@@ -611,10 +612,7 @@ def bm25_search(
      # print("bm25:", bm25)

      # Prepare query
-     if (clean == "Yes") | (text_column.endswith("_cleaned")):
-         token_query = convert_bm25_query_to_tokens(free_text_query, clean="Yes")
-     else:
-         token_query = convert_bm25_query_to_tokens(free_text_query, clean="No")

      # Perform search
      print("Searching")
@@ -685,6 +683,13 @@ def bm25_search(

      results_first_text = results_df_out[text_column].iloc[0]

      print("Returning results")

-     return results_first_text, results_df_name

  today_rev = datetime.now().strftime("%Y%m%d")

  from search_funcs.clean_funcs import initial_clean # get_lemma_tokens, stem_sentence
+ from search_funcs.helper_functions import get_file_path_end_with_ext, get_file_path_end, create_highlighted_excel_wb, ensure_output_folder_exists, output_folder, load_spacy_model

  PARAM_K1 = 1.5
  PARAM_B = 0.75

      with open(f"{output_folder}{filename}.pkl", "rb") as fsave:
          return pickle.load(fsave)

+ def tokenise_text_spacy(prepared_text_as_list:List[str], progress = gr.Progress(track_tqdm=True)):
+     '''
+     Tokenise a list of texts using the spaCy package and the en_core_web_sm model.
+     '''
+
+     # Load spaCy model
+     nlp = load_spacy_model()
+
+     prepared_search_text_list = []
+     batch_size = 256
+     for doc in nlp.tokenizer.pipe(progress.tqdm(prepared_text_as_list, desc = "Tokenising text", unit = "rows"), batch_size=batch_size):
+         prepared_search_text_list.append([token.text for token in doc])
+
+     return prepared_search_text_list
+
+ def tokenise_text_nltk(prepared_text_as_list: List[str], progress= gr.Progress(track_tqdm=True)):
+     """
+     Tokenise a list of texts using the NLTK package.
+     """
+     import nltk
+     nltk.download('punkt', quiet=True) # Download the necessary resource if not already present
+
+     prepared_search_text_list = []
+     for text in progress.tqdm(prepared_text_as_list, desc="Tokenising text", unit="rows"):
+         prepared_search_text_list.append(nltk.word_tokenize(text.lower())) # Lowercase for consistency
+
+     return prepared_search_text_list
+
  def prepare_bm25_input_data(
      in_file: list,
      text_column: str,

      else:
          tokeniser_tic = time.perf_counter()
          prepared_search_text_list = []
+
+         prepared_search_text_list = tokenise_text_spacy(prepared_text_as_list)

          tokeniser_toc = time.perf_counter()
          tokenizer_time_out = f"Tokenising the text took {tokeniser_toc - tokeniser_tic:0.1f} seconds."

      return message, None, bm25, prepared_search_text_list

+ def convert_bm25_query_to_tokens(free_text_query:str):
+     """
+     Split open text query into tokens.
+     """
+     split_query = tokenise_text_spacy([free_text_query.lower()])
+
+     # Flatten the list of lists into a single list
+     flattened_query = [token for sublist in split_query for token in sublist]
+
+     print("Search query out is:", flattened_query)
+
+     return flattened_query

  def bm25_search(
      free_text_query: str,

      Returns
      -------
      tuple
+         A tuple containing a message and the output search results files (if any).
      """

+     output_files = []
+
      progress(0, desc = "Conducting keyword search")

      print("in_join_file at start of bm25_search:", in_join_file)

      # print("bm25:", bm25)

      # Prepare query
+     token_query = convert_bm25_query_to_tokens(free_text_query)

      # Perform search
      print("Searching")

      results_first_text = results_df_out[text_column].iloc[0]

+     output_files.append(results_df_name)
+
+     csv_output_file = output_folder + "keyword_search_result_" + today_rev + "_" + query_str_file + ".csv"
+     results_df_out.to_csv(csv_output_file, index=None)
+
+     output_files.append(csv_output_file)
+
      print("Returning results")

+     return results_first_text, output_files
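For illustration, the NLTK path added above reduces to this outside of Gradio (a minimal sketch; the sample sentences are made up):

import nltk

nltk.download('punkt', quiet=True)  # fetch the tokeniser data if missing

texts = ["The Quick brown fox.", "BM25 ranks keyword matches."]
tokenised = [nltk.word_tokenize(text.lower()) for text in texts]
print(tokenised)
# [['the', 'quick', 'brown', 'fox', '.'], ['bm25', 'ranks', 'keyword', 'matches', '.']]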
search_funcs/clean_funcs.py CHANGED
@@ -1,13 +1,9 @@
  # ## Some functions to clean text
-
  import re
  import string
- import polars as pl

  # Add calendar months onto stop words
  import calendar
- #from tqdm import tqdm
- import gradio as gr

  from typing import List

@@ -15,7 +11,6 @@ from typing import List
  custom_words = []
  my_stop_words = custom_words

-
  cal_month = (list(calendar.month_name))
  cal_month = [x.lower() for x in cal_month]

@@ -24,7 +19,6 @@ cal_month = [i for i in cal_month if i]
  #print(cal_month)
  custom_words.extend(cal_month)

-
  # #### Some of my cleaning functions
  replace_backslash = r'\\'
  email_start_pattern_regex = r'.*importance:|.*subject:'
@@ -37,19 +31,19 @@ warning_pattern_regex = r'caution: this email originated from outside of the org
  nbsp_pattern_regex = r'&nbsp;'
  multiple_spaces_regex = r'\s{2,}'

- # Pre-compiling the regular expressions for efficiency
- # email_start_pattern = re.compile(email_start_pattern_regex)
- # email_end_pattern = re.compile(email_end_pattern_regex)
- # html_pattern = re.compile(html_pattern_regex)
- # email_pattern = re.compile(email_end_pattern_regex)
- # num_pattern = re.compile(num_pattern_regex)
- # postcode_pattern = re.compile(postcode_pattern_regex)
- # warning_pattern = re.compile(warning_pattern_regex)
- # nbsp_pattern = re.compile(nbsp_pattern_regex)

- def initial_clean(texts:List[str] , progress=gr.Progress()):
-     texts = pl.Series(texts)#[]

      text = texts.str.replace_all(replace_backslash, '/')
      text = text.str.replace_all(html_pattern_regex, '')
@@ -62,6 +56,33 @@ def initial_clean(texts:List[str] , progress=gr.Progress()):

      return text

  def remove_hyphens(text_text):
      return re.sub(r'(\w+)-(\w+)-?(\w)?', r'\1 \2 \3', text_text)

  # ## Some functions to clean text
  import re
  import string

  # Add calendar months onto stop words
  import calendar

  from typing import List

  custom_words = []
  my_stop_words = custom_words

  cal_month = (list(calendar.month_name))
  cal_month = [x.lower() for x in cal_month]

  #print(cal_month)
  custom_words.extend(cal_month)

  # #### Some of my cleaning functions
  replace_backslash = r'\\'
  email_start_pattern_regex = r'.*importance:|.*subject:'

  nbsp_pattern_regex = r'&nbsp;'
  multiple_spaces_regex = r'\s{2,}'

+ def initial_clean(texts:List[str]):
+     """
+     This function cleans a list of text strings by performing various replacements using polars.
+
+     Args:
+         texts (List[str]): A list of strings to clean.
+
+     Returns:
+         List[str]: A list of cleaned strings.
+     """
+     import polars as pl
+
+     texts = pl.Series(texts)

      text = texts.str.replace_all(replace_backslash, '/')
      text = text.str.replace_all(html_pattern_regex, '')

      return text

+
+ def initial_clean_pandas(texts: List[str]):
+     """
+     This function cleans a list of text strings by performing various replacements using pandas.
+
+     Args:
+         texts (List[str]): A list of strings to clean.
+
+     Returns:
+         List[str]: A list of cleaned strings.
+     """
+     import pandas as pd
+
+     # Create a pandas Series from the text list for easier manipulation
+     text_series = pd.Series(texts)
+
+     # Replace patterns with pandas string methods (`.str.replace`)
+     text_series = text_series.astype(str).str.replace(replace_backslash, '/', regex=True)
+     text_series = text_series.astype(str).str.replace(html_pattern_regex, '', regex=True)
+     text_series = text_series.astype(str).str.replace(email_start_pattern_regex, '', regex=True)
+     text_series = text_series.astype(str).str.replace(email_end_pattern_regex, '', regex=True)
+     text_series = text_series.astype(str).str.replace(email_pattern_regex, '', regex=True)
+     text_series = text_series.astype(str).str.replace(multiple_spaces_regex, ' ', regex=True)
+
+     # Convert cleaned Series back to a list
+     return text_series.tolist()
+
  def remove_hyphens(text_text):
      return re.sub(r'(\w+)-(\w+)-?(\w)?', r'\1 \2 \3', text_text)
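A minimal usage sketch for the new pandas-based cleaner (the sample strings are illustrative; the exact output depends on the module's regex patterns):

from search_funcs.clean_funcs import initial_clean_pandas

dirty = ["<p>Some   tagged text</p>", "subject: please read\nhello"]
print(initial_clean_pandas(dirty))  # HTML tags and email headers stripped, extra spaces collapsed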
search_funcs/helper_functions.py CHANGED
@@ -67,7 +67,7 @@ def get_connection_params(request: gr.Request):
  #print("Query parameters:", dict(request.query_params))
  # To get the underlying FastAPI items you would need to use await and some fancy @ stuff for a live query: https://fastapi.tiangolo.com/vi/reference/request/
  #print("Request dictionary to object:", request.request.body())
- print("Session hash:", request.session_hash)

  if 'x-cognito-id' in request.headers:
      out_session_hash = request.headers['x-cognito-id']
@@ -77,11 +77,11 @@ def get_connection_params(request: gr.Request):
  else:
      out_session_hash = request.session_hash
      base_folder = "temp-files/"
-     print("Cognito ID not found. Using session hash as save folder.")

  output_folder = base_folder + out_session_hash + "/"
- if bucket_name:
-     print("S3 output folder is: " + "s3://" + bucket_name + "/" + output_folder)

  return out_session_hash, output_folder
  else:
@@ -281,6 +281,21 @@ def put_columns_in_join_df(in_file:str):

  return gr.Dropdown(choices=concat_choices), new_df, out_message

  def display_info(info_component):
      gr.Info(info_component)

  #print("Query parameters:", dict(request.query_params))
  # To get the underlying FastAPI items you would need to use await and some fancy @ stuff for a live query: https://fastapi.tiangolo.com/vi/reference/request/
  #print("Request dictionary to object:", request.request.body())
+ #print("Session hash:", request.session_hash)

  if 'x-cognito-id' in request.headers:
      out_session_hash = request.headers['x-cognito-id']

  else:
      out_session_hash = request.session_hash
      base_folder = "temp-files/"
+     #print("Cognito ID not found. Using session hash as save folder.")

  output_folder = base_folder + out_session_hash + "/"
+ #if bucket_name:
+ #    print("S3 output folder is: " + "s3://" + bucket_name + "/" + output_folder)

  return out_session_hash, output_folder
  else:

  return gr.Dropdown(choices=concat_choices), new_df, out_message

+ def load_spacy_model():
+     # Load the SpaCy model
+     from spacy.cli.download import download
+     import spacy
+     spacy.prefer_gpu()
+
+     try:
+         import en_core_web_sm
+         nlp = en_core_web_sm.load()
+         print("Successfully imported spaCy model")
+     except:
+         download("en_core_web_sm")
+         nlp = spacy.load("en_core_web_sm")
+         print("Successfully imported spaCy model")
+     return nlp

  def display_info(info_component):
      gr.Info(info_component)
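A minimal usage sketch of the consolidated loader (spaCy and the en_core_web_sm package must be installed or downloadable):

from search_funcs.helper_functions import load_spacy_model

nlp = load_spacy_model()  # falls back to downloading en_core_web_sm on first run
print([token.text for token in nlp.tokenizer("The quick brown fox")])
# ['The', 'quick', 'brown', 'fox']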
search_funcs/semantic_functions.py CHANGED
@@ -6,58 +6,61 @@ import gradio as gr
  import numpy as np
  from datetime import datetime
  from search_funcs.helper_functions import get_file_path_end, create_highlighted_excel_wb, ensure_output_folder_exists, output_folder
- from torch import cuda, backends
- from sentence_transformers import SentenceTransformer
  PandasDataFrame = Type[pd.DataFrame]
-
  today_rev = datetime.now().strftime("%Y%m%d")

- # Check for torch cuda
- print("Is CUDA enabled? ", cuda.is_available())
- print("Is a CUDA device available on this computer?", backends.cudnn.enabled)
- if cuda.is_available():
-     torch_device = "cuda"
-     os.system("nvidia-smi")
-
- else:
-     torch_device = "cpu"
-
- print("Device used is: ", torch_device)
-
- # Load embeddings
- embeddings_name = "BAAI/bge-small-en-v1.5"
-
- # Define a list of possible local locations to search for the model
- local_embeddings_locations = [
-     "model/bge/", # Potential local location
-     "/model/bge/", # Potential location in Docker container
-     "/home/user/app/model/bge/" # This is inside a Docker container
- ]
-
- # Attempt to load the model from each local location
- for location in local_embeddings_locations:
-     try:
-         embeddings_model = SentenceTransformer(location)
-         print(f"Found local model installation at: {location}")
-         break # Exit the loop if the model is found
-     except Exception as e:
-         print(f"Failed to load model from {location}: {e}")
-         continue
- else:
-     # If the loop completes without finding the model in any local location
-     embeddings_model = SentenceTransformer(embeddings_name)
-     print("Could not find local model installation. Downloading from Huggingface")
-
  def docs_to_bge_embed_np_array(
      docs_out: list,
      in_file: list,
-     embeddings_state: np.ndarray,
      output_file_state: str,
-     clean: str,
      return_intermediate_files: str = "No",
-     embeddings_super_compress: str = "No",
-     embeddings_model: SentenceTransformer = embeddings_model,
      progress: gr.Progress = gr.Progress(track_tqdm=True)
  ) -> tuple:
  """
@@ -66,18 +69,20 @@ def docs_to_bge_embed_np_array(
  Parameters:
  - docs_out (list): List of documents to be embedded.
  - in_file (list): List of input files.
- - embeddings_state (np.ndarray): Current state of embeddings.
  - output_file_state (str): State of the output file.
  - clean (str): Indicates if the data should be cleaned.
  - return_intermediate_files (str, optional): Whether to return intermediate files. Default is "No".
- - embeddings_super_compress (str, optional): Whether to super compress the embeddings. Default is "No".
- - embeddings_model (SentenceTransformer, optional): The embeddings model to use. Default is embeddings_model.
  - progress (gr.Progress, optional): Progress tracker for the function. Default is gr.Progress(track_tqdm=True).

  Returns:
  - tuple: A tuple containing the output message, embeddings, and output file state.
  """

  ensure_output_folder_exists(output_folder)

@@ -102,12 +107,29 @@ def docs_to_bge_embed_np_array(

  out_message = "Document processing complete. Ready to search."

-
  if embeddings_state.size == 0:
      tic = time.perf_counter()
      print("Starting to embed documents.")

-     embeddings_out = embeddings_model.encode(sentences=page_contents, show_progress_bar = True, batch_size = 32, normalize_embeddings=True) # For BGE

      toc = time.perf_counter()
      time_out = f"The embedding took {toc - tic:0.1f} seconds"
@@ -119,27 +141,25 @@ def docs_to_bge_embed_np_array(
  else: data_file_name_no_ext = data_file_name_no_ext

  progress(0.9, desc = "Saving embeddings to file")
- if embeddings_super_compress == "No":
      semantic_search_file_name = output_folder + data_file_name_no_ext + '_bge_embeddings.npz'
-     np.savez_compressed(semantic_search_file_name, embeddings_out)
  else:
      semantic_search_file_name = output_folder + data_file_name_no_ext + '_bge_embedding_compress.npz'
-     embeddings_out_round = np.round(embeddings_out, 3)
-     embeddings_out_round *= 100 # Rounding not currently used
-     np.savez_compressed(semantic_search_file_name, embeddings_out_round)

  output_file_state.append(semantic_search_file_name)

- return out_message, embeddings_out, output_file_state, output_file_state

- return out_message, embeddings_out, output_file_state, output_file_state
  else:
      # Just return existing embeddings if already exist
      embeddings_out = embeddings_state

  print(out_message)

- return out_message, embeddings_out, output_file_state, output_file_state

  def process_data_from_scores_df(
      df_docs: pd.DataFrame,
@@ -226,14 +246,15 @@ def bge_semantic_search(
  embeddings: np.ndarray,
  documents: list,
  k_val: int,
- vec_score_cut_off: float,
  in_join_file: pd.DataFrame,
  in_join_column: str = None,
- search_df_join_column: str = None,
- device: str = torch_device,
- embeddings_model: SentenceTransformer = embeddings_model,
  progress: gr.Progress = gr.Progress(track_tqdm=True)
- ) -> pd.DataFrame:
  """
  Perform a semantic search using the BGE model.

@@ -243,33 +264,83 @@ def bge_semantic_search(
  - documents (list): The list of documents to search.
  - k_val (int): The number of top results to return.
  - vec_score_cut_off (float): The score cutoff for filtering results.
  - in_join_file (pd.DataFrame): The DataFrame to join with the search results.
  - in_join_column (str, optional): The column name in the join DataFrame to join on. Default is None.
- - search_df_join_column (str, optional): The column name in the search DataFrame to join on. Default is None.
- - device (str, optional): The device to run the model on. Default is torch_device.
- - embeddings_model (SentenceTransformer, optional): The embeddings model to use. Default is embeddings_model.
  - progress (gr.Progress, optional): Progress tracker for the function. Default is gr.Progress(track_tqdm=True).

  Returns:
- - pd.DataFrame: The DataFrame containing the search results.
  """

  progress(0, desc = "Conducting semantic search")

  ensure_output_folder_exists(output_folder)

  print("Searching")

- # Load the sentence transformer model and move it to GPU
- embeddings_model = embeddings_model.to(device)

  # Encode the query using the sentence transformer and convert to a PyTorch tensor
- query = embeddings_model.encode(query_str, normalize_embeddings=True)

- # Sentence transformers method, not used:
- cosine_similarities = query @ embeddings.T

- # Flatten the tensor to a 1D array
  cosine_similarities = cosine_similarities.flatten()

  # Create a Pandas Series
@@ -309,6 +380,13 @@ def bge_semantic_search(
  #results_df_out.to_excel(results_df_name, index= None)
  results_first_text = results_df_out.iloc[0, 1]

  print("Returning results")

  return results_first_text, results_df_name

  import numpy as np
  from datetime import datetime
  from search_funcs.helper_functions import get_file_path_end, create_highlighted_excel_wb, ensure_output_folder_exists, output_folder
  PandasDataFrame = Type[pd.DataFrame]
  today_rev = datetime.now().strftime("%Y%m%d")

+ def load_embedding_model(embeddings_name = "BAAI/bge-small-en-v1.5", embedding_loc="bge/"):
+
+     from torch import cuda, backends
+     from sentence_transformers import SentenceTransformer
+
+     # Check for torch cuda
+     print("Is CUDA enabled? ", cuda.is_available())
+     print("Is a CUDA device available on this computer?", backends.cudnn.enabled)
+     if cuda.is_available():
+         torch_device = "cuda"
+         #os.system("nvidia-smi")
+     else:
+         torch_device = "cpu"
+
+     print("Device used is: ", torch_device)
+
+     # Define a list of possible local locations to search for the model
+     local_embeddings_locations = [
+         "model/" + embedding_loc, # Potential local location
+         "/model/" + embedding_loc, # Potential location in Docker container
+         "/home/user/app/model/" + embedding_loc # This is inside a Docker container
+     ]
+
+     # Attempt to load the model from each local location
+     for location in local_embeddings_locations:
+         try:
+             embeddings_model = SentenceTransformer(location)
+             print(f"Found local model installation at: {location}")
+             break # Exit the loop if the model is found
+         except Exception as e:
+             print(f"Failed to load model from {location}: {e}")
+             continue
+     else:
+         # If the loop completes without finding the model in any local location
+         embeddings_model = SentenceTransformer(embeddings_name)
+         print("Could not find local model installation. Downloading from Huggingface")
+
+     # Load the sentence transformer model and move it to CPU/GPU
+     embeddings_model = embeddings_model.to(torch_device)
+
+     return embeddings_model, torch_device

  def docs_to_bge_embed_np_array(
      docs_out: list,
      in_file: list,
      output_file_state: str,
+     clean: str,
+     embeddings_state: np.ndarray,
+     embeddings_model_name:str,
+     embeddings_model_loc:str,
      return_intermediate_files: str = "No",
+     embeddings_compress: str = "No",
      progress: gr.Progress = gr.Progress(track_tqdm=True)
  ) -> tuple:
  """

  Parameters:
  - docs_out (list): List of documents to be embedded.
  - in_file (list): List of input files.
  - output_file_state (str): State of the output file.
  - clean (str): Indicates if the data should be cleaned.
+ - embeddings_state (np.ndarray): Current state of embeddings.
+ - embeddings_model_name (str): The Huggingface repo name of the embeddings model.
+ - embeddings_model_loc (str): Embeddings model save location.
  - return_intermediate_files (str, optional): Whether to return intermediate files. Default is "No".
+ - embeddings_compress (str, optional): Whether to compress the embeddings to int8 precision. Default is "No".
  - progress (gr.Progress, optional): Progress tracker for the function. Default is gr.Progress(track_tqdm=True).

  Returns:
  - tuple: A tuple containing the output message, embeddings, and output file state.
  """

+ embeddings_model, torch_device = load_embedding_model(embeddings_model_name, embeddings_model_loc)

  ensure_output_folder_exists(output_folder)

  out_message = "Document processing complete. Ready to search."

  if embeddings_state.size == 0:
      tic = time.perf_counter()
      print("Starting to embed documents.")

+     # Encode embeddings. If in normal mode, float32; if in compressed mode, int8
+     batch_size = 32
+
+     if "bge" in embeddings_model_name:
+         print("Embedding with BGE model")
+         if embeddings_compress == "No":
+             print("Embedding with full fp32 precision")
+             embeddings_out = embeddings_model.encode(sentences=page_contents, show_progress_bar = True, batch_size = batch_size, normalize_embeddings=True)
+         else:
+             print("Embedding with int8 precision")
+             embeddings_out = embeddings_model.encode(sentences=page_contents, show_progress_bar = True, batch_size = batch_size, normalize_embeddings=True, precision="int8")
+     else:
+         print("Embedding with MiniLM-L6-v2 model")
+         if embeddings_compress == "No":
+             print("Embedding with full fp32 precision")
+             embeddings_out = embeddings_model.encode(sentences=page_contents, show_progress_bar = True, batch_size = batch_size)
+         else:
+             print("Embedding with int8 precision")
+             embeddings_out = embeddings_model.encode(sentences=page_contents, show_progress_bar = True, batch_size = batch_size, precision="int8")

      toc = time.perf_counter()
      time_out = f"The embedding took {toc - tic:0.1f} seconds"

  else: data_file_name_no_ext = data_file_name_no_ext

  progress(0.9, desc = "Saving embeddings to file")
+ if embeddings_compress == "No":
      semantic_search_file_name = output_folder + data_file_name_no_ext + '_bge_embeddings.npz'
  else:
      semantic_search_file_name = output_folder + data_file_name_no_ext + '_bge_embedding_compress.npz'
+
+ np.savez_compressed(semantic_search_file_name, embeddings_out)

  output_file_state.append(semantic_search_file_name)

+ return out_message, embeddings_out, output_file_state, output_file_state, embeddings_model

+ return out_message, embeddings_out, output_file_state, output_file_state, embeddings_model
  else:
      # Just return existing embeddings if already exist
      embeddings_out = embeddings_state

  print(out_message)

+ return out_message, embeddings_out, output_file_state, output_file_state, embeddings_model

  def process_data_from_scores_df(
      df_docs: pd.DataFrame,

  embeddings: np.ndarray,
  documents: list,
  k_val: int,
+ vec_score_cut_off: float,
+ embeddings_model,
+ embeddings_model_name: str,
+ embeddings_compress:str,
  in_join_file: pd.DataFrame,
  in_join_column: str = None,
+ search_df_join_column: str = None,
  progress: gr.Progress = gr.Progress(track_tqdm=True)
+ ) -> tuple:
  """
  Perform a semantic search using the BGE model.

  - documents (list): The list of documents to search.
  - k_val (int): The number of top results to return.
  - vec_score_cut_off (float): The score cutoff for filtering results.
+ - embeddings_model (SentenceTransformer, optional): The embeddings model to use.
+ - embeddings_model_name (str): The Huggingface repo name of the embeddings model.
+ - embeddings_compress (str): Whether the embeddings have been compressed to int8 precision.
  - in_join_file (pd.DataFrame): The DataFrame to join with the search results.
  - in_join_column (str, optional): The column name in the join DataFrame to join on. Default is None.
+ - search_df_join_column (str, optional): The column name in the search DataFrame to join on. Default is None.
  - progress (gr.Progress, optional): Progress tracker for the function. Default is gr.Progress(track_tqdm=True).

  Returns:
+ - tuple: A tuple containing the search results.
  """

  progress(0, desc = "Conducting semantic search")

+ output_files = []
+
  ensure_output_folder_exists(output_folder)

  print("Searching")

+ from sentence_transformers import quantize_embeddings

  # Encode the query using the sentence transformer and convert to a PyTorch tensor
+ if "bge" in embeddings_model_name:
+     if embeddings_compress == "Yes":
+         query_fp32 = embeddings_model.encode(query_str, normalize_embeddings=True)
+
+         #query = query_fp32
+         query = quantize_embeddings(
+             query_fp32,
+             precision="int8",
+             calibration_embeddings=embeddings)
+     else:
+         query = embeddings_model.encode(query_str, normalize_embeddings=True)
+
+     # Get cosine similarities
+     cosine_similarities = query @ embeddings.T
+
+     # Sentence transformers method, not used:
+     #cosine_similarities = query @ embeddings.T
+     #cosine_similarities = embeddings_model.similarity(query, embeddings)
+     # Flatten the tensor to a 1D array
+     #cosine_similarities = cosine_similarities.flatten()
+ else:
+     print("Comparing similarity using MiniLM-L6-v2")
+
+     if embeddings_compress == "Yes":
+         query_fp32 = embeddings_model.encode(query_str, normalize_embeddings=True)
+
+         #query = query_fp32
+         query = quantize_embeddings(
+             query_fp32,
+             precision="int8",
+             calibration_embeddings=embeddings)
+     else:
+         query = embeddings_model.encode(query_str, normalize_embeddings=True)
+
+     #cosine_similarities = embeddings_model.cosine_similarity(query, embeddings)
+
+     print("query:", query_fp32)
+     print("embeddings:", embeddings)
+
+     embeddings_norm = np.linalg.norm(embeddings, axis=1)
+
+     embeddings_norm = np.linalg.norm(embeddings, axis=1, keepdims=True) # Keep dims to allow broadcasting
+     normalized_embeddings = embeddings / embeddings_norm
+
+     print("normalized_embeddings:", normalized_embeddings)
+
+     expanded_query_fp32 = np.expand_dims(query_fp32, axis=0)
+     cosine_similarities = (expanded_query_fp32 @ normalized_embeddings.T)
+
+     print("Initial cosine similarities:", cosine_similarities)
+
+     # Flatten the tensor to a 1D array
  cosine_similarities = cosine_similarities.flatten()

  # Create a Pandas Series

  #results_df_out.to_excel(results_df_name, index= None)
  results_first_text = results_df_out.iloc[0, 1]

+ output_files.append(results_df_name)
+
+ csv_output_file = output_folder + "semantic_search_result_" + today_rev + "_" + query_str_file + ".csv"
+ results_df_out.to_csv(csv_output_file, index=None)
+
+ output_files.append(csv_output_file)
+
  print("Returning results")

  return results_first_text, results_df_name
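The MiniLM branch above computes cosine similarity by hand; in isolation the arithmetic looks like this (a toy sketch with made-up vectors):

import numpy as np

# Toy document embeddings (rows = documents) and an un-normalised query
embeddings = np.array([[0.2, 0.9], [0.8, 0.1], [0.5, 0.5]])
query_fp32 = np.array([0.3, 0.8])

# Normalise both sides so a plain dot product equals cosine similarity
norms = np.linalg.norm(embeddings, axis=1, keepdims=True)  # keepdims allows broadcasting
normalized_embeddings = embeddings / norms
query = query_fp32 / np.linalg.norm(query_fp32)

cosine_similarities = (np.expand_dims(query, axis=0) @ normalized_embeddings.T).flatten()
print(cosine_similarities)  # highest value = most semantically similar document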
search_funcs/semantic_ingest_functions.py CHANGED
@@ -1,18 +1,15 @@
- # Install/import packages
  import time
- import re
  import ast
  import gzip
  import pandas as pd
  import gradio as gr
  import pickle
  from typing import Type, List, Literal
- #from langchain.text_splitter import RecursiveCharacterTextSplitter
-
  from pydantic import BaseModel, Field

  # Creating an alias for pandas DataFrame using Type
  PandasDataFrame = Type[pd.DataFrame]

  class Document(BaseModel):
      """Class for storing a piece of text and associated metadata. Implementation adapted from Langchain code: https://github.com/langchain-ai/langchain/blob/master/libs/core/langchain_core/documents/base.py"""
@@ -25,114 +22,21 @@ class Document(BaseModel):
  """
  type: Literal["Document"] = "Document"

- # Constants for chunking - not currently used
- split_strat = ["\n\n", "\n", ". ", "! ", "? "]
- chunk_size = 512
- chunk_overlap = 0
- start_index = True
-
- from search_funcs.helper_functions import get_file_path_end_with_ext, detect_file_type, get_file_path_end, ensure_output_folder_exists
  from search_funcs.bm25_functions import save_prepared_bm25_data, output_folder
  from search_funcs.clean_funcs import initial_clean

- def parse_file_not_used(file_paths, text_column='text'):
-     """
-     Accepts a list of file paths, determines each file's type based on its extension,
-     and passes it to the relevant parsing function.
-
-     Parameters:
-         file_paths (list): List of file paths.
-         text_column (str): Name of the column in CSV/Excel files that contains the text content.
-
-     Returns:
-         dict: A dictionary with file paths as keys and their parsed content (or error message) as values.
-     """
-
-     if not isinstance(file_paths, list):
-         raise ValueError("Expected a list of file paths.")
-
-     extension_to_parser = {
-         # '.pdf': parse_pdf,
-         # '.docx': parse_docx,
-         # '.txt': parse_txt,
-         # '.html': parse_html,
-         # '.htm': parse_html, # Considering both .html and .htm for HTML files
-         '.csv': lambda file_path: parse_csv_or_excel(file_path, text_column),
-         '.xlsx': lambda file_path: parse_csv_or_excel(file_path, text_column),
-         '.parquet': lambda file_path: parse_csv_or_excel(file_path, text_column)
-     }
-
-     parsed_contents = {}
-     file_names = []
-
-     for file_path in file_paths:
-         file_extension = detect_file_type(file_path.name)
-         if file_extension in extension_to_parser:
-             parsed_contents[file_path.name] = extension_to_parser[file_extension](file_path.name)
-         else:
-             parsed_contents[file_path.name] = f"Unsupported file type: {file_extension}"
-
-         filename_end = get_file_path_end_with_ext(file_path.name)
-
-         file_names.append(filename_end)
-
-     return parsed_contents, file_names
-
- def text_regex_clean(text):
-     # Merge hyphenated words
-     text = re.sub(r"(\w+)-\n(\w+)", r"\1\2", text)
-     # If a double newline ends in a letter, add a full stop.
-     text = re.sub(r'(?<=[a-zA-Z])\n\n', '.\n\n', text)
-     # Fix newlines in the middle of sentences
-     text = re.sub(r"(?<!\n\s)\n(?!\s\n)", " ", text.strip())
-     # Remove multiple newlines
-     text = re.sub(r"\n\s*\n", "\n\n", text)
-     text = re.sub(r"  ", " ", text)
-     # Add full stops and new lines between words with no space between where the second one has a capital letter
-     text = re.sub(r'(?<=[a-z])(?=[A-Z])', '. \n\n', text)
-
-     return text
-
- def parse_csv_or_excel(file_path, data_state, text_column = "text"):
-     """
-     Read in a CSV or Excel file.
-
-     Parameters:
-         file_path (str): Path to the CSV file.
-         text_column (str): Name of the column in the CSV file that contains the text content.
-
-     Returns:
-         Pandas DataFrame: Dataframe output from file read
-     """
-
-     file_list = [string.name for string in file_path]
-
-     #print(file_list)
-
-     data_file_names = [string for string in file_list if "tokenised" not in string.lower() and "npz" not in string.lower()]# and "gz" not in string.lower()]
-
-     data_file_name = data_file_names[0]
-
-     #for file_path in file_paths:
-     file_name = get_file_path_end_with_ext(data_file_name)
-
-     message = "Loaded in file. Now converting to document format."
-     print(message)
-
-     return data_state, file_name, message
-
- def write_out_metadata_as_string(metadata_in):
-     # If metadata_in is a single dictionary, wrap it in a list
-     if isinstance(metadata_in, dict):
-         metadata_in = [metadata_in]
-     metadata_string = [f"{' '.join(f'{k}: {v}' for k, v in d.items() if k != 'page_section')}" for d in metadata_in] # ['metadata']
-     return metadata_string

- def combine_metadata_columns(df, cols):

  df['metadata'] = '{'
  df['blank_column'] = ''
@@ -147,32 +51,14 @@ def combine_metadata_columns(df, cols):

  return df['metadata']

- def split_string_into_chunks(input_string, max_length, split_symbols):
-     # Check if input_string or split_symbols are empty
-     if not input_string or not split_symbols:
-         return [input_string]
-
-     chunks = []
-     current_chunk = ""
-
-     for char in input_string:
-         current_chunk += char
-         if len(current_chunk) >= max_length or char in split_symbols:
-             # Add the current chunk to the chunks list
-             chunks.append(current_chunk)
-             current_chunk = ""
-
-     # Adding any remaining part of the string
-     if current_chunk:
-         chunks.append(current_chunk)
-
-     return chunks
-
- def clean_line_breaks(text):
-     # Replace \n and \r\n with a space
      return text.replace('\n', ' ').replace('\r', ' ').replace('\r\n', ' ')

  def parse_metadata(row):
      try:
          # Ensure the 'title' field is a string and clean line breaks
          #if 'TITLE' in row:
@@ -193,8 +79,20 @@ def parse_metadata(row):
      # Handle the error or log it
      return None # or some default value

- def csv_excel_text_to_docs(df, in_file, text_column, clean = "No", return_intermediate_files = "No", chunk_size=None, progress=gr.Progress(track_tqdm=True)) -> List[Document]:
-     """Converts a DataFrame's content to a list of dictionaries in the 'Document' format, containing page_content and associated metadata."""

  ensure_output_folder_exists(output_folder)
  output_list = []
@@ -212,7 +110,7 @@ def csv_excel_text_to_docs(df, in_file, text_column, clean = "No", return_interm
  return doc_sections, "Please load in at least one csv/Excel/parquet data file.", output_list

  if not text_column:
-     return None, "Please enter a column name to search"

  data_file_name = data_file_names[0]

@@ -246,7 +144,6 @@ def csv_excel_text_to_docs(df, in_file, text_column, clean = "No", return_interm

  df[text_column] = df_list

-
  clean_toc = time.perf_counter()
  clean_time_out = f"Cleaning the text took {clean_toc - clean_tic:0.1f} seconds."
  print(clean_time_out)
@@ -285,26 +182,4 @@ def csv_excel_text_to_docs(df, in_file, text_column, clean = "No", return_interm
  output_list.append(out_doc_file_name)
  print("Documents saved to file.")

- return doc_sections, "Finished preparing documents.", output_list
-
- def document_to_dataframe(documents):
-     '''
-     Convert an object in document format to pandas dataframe
-     '''
-     rows = []
-
-     for doc in documents:
-         # Convert Document to dictionary and extract metadata
-         doc_dict = doc.dict()
-         metadata = doc_dict.pop('metadata')
-
-         # Add the page_content and type to the metadata
-         metadata['page_content'] = doc_dict['page_content']
-         metadata['type'] = doc_dict['type']
-
-         # Add to the list of rows
-         rows.append(metadata)
-
-     # Create a DataFrame from the list of rows
-     df = pd.DataFrame(rows)
-     return df

  import time
  import ast
  import gzip
  import pandas as pd
  import gradio as gr
  import pickle
  from typing import Type, List, Literal
  from pydantic import BaseModel, Field

  # Creating an alias for pandas DataFrame using Type
  PandasDataFrame = Type[pd.DataFrame]
+ PandasSeries = Type[pd.Series]

  class Document(BaseModel):
      """Class for storing a piece of text and associated metadata. Implementation adapted from Langchain code: https://github.com/langchain-ai/langchain/blob/master/libs/core/langchain_core/documents/base.py"""

  """
  type: Literal["Document"] = "Document"

+ from search_funcs.helper_functions import get_file_path_end, ensure_output_folder_exists
  from search_funcs.bm25_functions import save_prepared_bm25_data, output_folder
  from search_funcs.clean_funcs import initial_clean

+ def combine_metadata_columns(df:PandasDataFrame, cols:List[str]) -> PandasSeries:
+     '''
+     Construct a metadata column as a string version of a dictionary for later parsing.
+
+     Parameters:
+     - df (PandasDataFrame): Data frame of search data.
+     - cols (List[str]): List of column names that will be included in the output metadata column.
+
+     Returns:
+     - PandasSeries: A series containing the metadata elements combined into a dictionary format as a string.
+     '''

  df['metadata'] = '{'
  df['blank_column'] = ''

  return df['metadata']

+ def clean_line_breaks(text:str):
+     '''Replace \n and \r\n with a space'''
      return text.replace('\n', ' ').replace('\r', ' ').replace('\r\n', ' ')

  def parse_metadata(row):
+     '''
+     Parse a string instance of a dictionary into a Python object.
+     '''
      try:
          # Ensure the 'title' field is a string and clean line breaks
          #if 'TITLE' in row:

  # Handle the error or log it
  return None # or some default value

+ def csv_excel_text_to_docs(df:PandasDataFrame, in_file:List[str], text_column:str, clean:str = "No", return_intermediate_files:str = "No", progress=gr.Progress(track_tqdm=True)) -> tuple:
+     """Converts a DataFrame's content to a list of dictionaries in the 'Document' format, containing page_content and associated metadata.
+
+     Parameters:
+     - df (PandasDataFrame): Data frame of search data.
+     - in_file (List[str]): List of input file names.
+     - text_column (str): The text column that will be searched.
+     - clean (str): Whether the text is cleaned before searching.
+     - return_intermediate_files (str): Whether intermediate processing files are saved to file.
+     - progress (gr.Progress, optional): The progress tracker for the operation.
+
+     Returns:
+     - tuple: A tuple containing data outputs in a Document class format, an output message, and a list of output file paths.
+     """

  ensure_output_folder_exists(output_folder)
  output_list = []

  return doc_sections, "Please load in at least one csv/Excel/parquet data file.", output_list

  if not text_column:
+     return None, "Please enter a column name to search", output_list

  data_file_name = data_file_names[0]

  df[text_column] = df_list

  clean_toc = time.perf_counter()
  clean_time_out = f"Cleaning the text took {clean_toc - clean_tic:0.1f} seconds."
  print(clean_time_out)

  output_list.append(out_doc_file_name)
  print("Documents saved to file.")

+ return doc_sections, "Finished preparing documents.", output_list
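For reference, a sketch of the Document objects that csv_excel_text_to_docs emits (the field values are illustrative; the field names follow the Langchain-style class defined above):

from search_funcs.semantic_ingest_functions import Document

doc = Document(
    page_content="Text from the selected search column",
    metadata={"row": 0, "source": "example.csv"},
)
print(doc.type)  # "Document"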
search_funcs/spacy_search_funcs.py CHANGED
@@ -7,30 +7,19 @@ import gradio as gr
  import pandas as pd
  from typing import List, Type
  from datetime import datetime
- from search_funcs.helper_functions import create_highlighted_excel_wb, output_folder

  PandasDataFrame = Type[pd.DataFrame]

  today_rev = datetime.now().strftime("%Y%m%d")

- # Load the SpaCy model

- #os.system("python -m spacy download en_core_web_sm")
- try:
-     import en_core_web_sm
-     nlp = en_core_web_sm.load()
-     print("Successfully imported spaCy model")
-     #nlp = spacy.load("en_core_web_sm")
-     #print(nlp._path)
- except:
-     download("en_core_web_sm")
-     nlp = spacy.load("en_core_web_sm")
-     print("Successfully imported spaCy model")

  def spacy_fuzzy_search(string_query:str, tokenised_data: List[List[str]], original_data: PandasDataFrame, text_column:str, in_join_file: PandasDataFrame, search_df_join_column:str, in_join_column:str, no_spelling_mistakes:int = 1, progress=gr.Progress(track_tqdm=True)):
      ''' Conduct fuzzy match on a list of data.'''

-     #print("df_list:", df_list)

      # Convert tokenised data back into a list of strings
      df_list = list(map(" ".join, tokenised_data))

  import pandas as pd
  from typing import List, Type
  from datetime import datetime
+ from search_funcs.helper_functions import create_highlighted_excel_wb, output_folder, load_spacy_model

  PandasDataFrame = Type[pd.DataFrame]

  today_rev = datetime.now().strftime("%Y%m%d")

  def spacy_fuzzy_search(string_query:str, tokenised_data: List[List[str]], original_data: PandasDataFrame, text_column:str, in_join_file: PandasDataFrame, search_df_join_column:str, in_join_column:str, no_spelling_mistakes:int = 1, progress=gr.Progress(track_tqdm=True)):
      ''' Conduct fuzzy match on a list of data.'''

+     # Load spaCy model
+     nlp = load_spacy_model()

      # Convert tokenised data back into a list of strings
      df_list = list(map(" ".join, tokenised_data))