seanpedrickcase committed on
Commit
7f029b5
1 Parent(s): ea0dd40

Now accepts .zip files as inputs. Moved semantic search option bar. Minor API mode changes.

Browse files
.dockerignore CHANGED
@@ -1,7 +1,6 @@
1
  *.csv
2
  *.pyc
3
  *.cpython-311.pyc
4
- *.cpython-310.pyc
5
  *.bat
6
  *.json
7
  *.xlsx
@@ -16,6 +15,7 @@
16
  *.pkl
17
  *.pkl.gz
18
  *.pem
 
19
  docs/*
20
  build/*
21
  dist/*
 
1
  *.csv
2
  *.pyc
3
  *.cpython-311.pyc
 
4
  *.bat
5
  *.json
6
  *.xlsx
 
15
  *.pkl
16
  *.pkl.gz
17
  *.pem
18
+ *.zip
19
  docs/*
20
  build/*
21
  dist/*
.gitignore CHANGED
@@ -1,7 +1,6 @@
1
  *.csv
2
  *.pyc
3
  *.cpython-311.pyc
4
- *.cpython-310.pyc
5
  *.bat
6
  *.json
7
  *.xlsx
@@ -18,6 +17,7 @@
18
  *.pem
19
  *.json.out
20
  *.env
 
21
  docs/*
22
  build/*
23
  dist/*
 
1
  *.csv
2
  *.pyc
3
  *.cpython-311.pyc
 
4
  *.bat
5
  *.json
6
  *.xlsx
 
17
  *.pem
18
  *.json.out
19
  *.env
20
+ *.zip
21
  docs/*
22
  build/*
23
  dist/*
app.py CHANGED
@@ -78,7 +78,7 @@ depends on factors such as the type of documents or queries. Information taken f
78
  current_source = gr.Textbox(label="Current data source(s)", value="None")
79
 
80
  with gr.Accordion(label = "Load in data", open=True):
81
- in_bm25_file = gr.File(label="Upload data for keyword search", file_count= 'multiple', file_types =['.parquet', '.csv', '.pkl', '.pkl.gz'])
82
  with gr.Row():
83
  in_bm25_column = gr.Dropdown(label="Enter the name of the text column in the data file to search")
84
  load_bm25_data_button = gr.Button(value="Load data")
@@ -107,7 +107,7 @@ depends on factors such as the type of documents or queries. Information taken f
107
  current_source_semantic = gr.Textbox(label="Current data source(s)", value="None")
108
 
109
  with gr.Accordion("Load in data", open = True):
110
- in_semantic_file = gr.File(label="Upload data file for semantic search", file_count= 'multiple', file_types = ['.parquet', '.csv', '.npy', '.npz', '.pkl', '.pkl.gz'])
111
 
112
  with gr.Row():
113
  in_semantic_column = gr.Dropdown(label="Enter the name of the text column in the data file to search")
@@ -115,6 +115,9 @@ depends on factors such as the type of documents or queries. Information taken f
115
 
116
  semantic_load_progress = gr.Textbox(label="Load progress")
117
 
 
 
 
118
  semantic_query = gr.Textbox(label="Enter semantic search query here")
119
  semantic_submit = gr.Button(value="Start semantic search", variant="primary")
120
 
@@ -146,8 +149,7 @@ depends on factors such as the type of documents or queries. Information taken f
146
  in_search_param_button = gr.Button(value="Load search parameters (Need to click this if you changed anything above)")
147
  with gr.Accordion(label="Fuzzy search options", open = False):
148
  no_spelling_mistakes = gr.Slider(label = "Number of spelling mistakes allowed in fuzzy search", value = 1, minimum=1, maximum=4, step=1)
149
- with gr.Accordion(label="Semantic search options", open = False):
150
- semantic_min_distance = gr.Slider(label = "Minimum distance score for search result to be included", value = 0.6, minimum=0, maximum=0.95, step=0.01)
151
  with gr.Accordion(label = "Join on additional dataframes to results", open = False):
152
  in_join_file = gr.File(label="Upload your data to join here")
153
  in_join_message = gr.Textbox(label="Join file load progress")
@@ -180,7 +182,7 @@ depends on factors such as the type of documents or queries. Information taken f
180
 
181
  ### BM25 SEARCH ###
182
  # Update dropdowns upon initial file load
183
- in_bm25_file.change(initial_data_load, inputs=[in_bm25_file], outputs=[in_bm25_column, search_df_join_column, prepared_keyword_data_state, orig_keyword_data_state, bm25_search_index_state, embeddings_state, tokenised_prepared_keyword_data_state, load_finished_message, current_source], api_name="initial_load")
184
  in_join_file.change(put_columns_in_join_df, inputs=[in_join_file], outputs=[in_join_column, join_data_state, in_join_message])
185
 
186
  # Load in BM25 data
@@ -197,7 +199,8 @@ depends on factors such as the type of documents or queries. Information taken f
197
  ### SEMANTIC SEARCH ###
198
 
199
  # Load in a csv/excel file for semantic search
200
- in_semantic_file.change(initial_data_load, inputs=[in_semantic_file], outputs=[in_semantic_column, search_df_join_column, semantic_data_state, orig_semantic_data_state, bm25_search_index_state, embeddings_state, tokenised_prepared_keyword_data_state, semantic_load_progress, current_source_semantic])
 
201
  load_semantic_data_button.click(
202
  csv_excel_text_to_docs, inputs=[semantic_data_state, in_semantic_file, in_semantic_column, in_clean_data, return_intermediate_files], outputs=[semantic_input_document_format, semantic_load_progress, output_file_state], api_name="convert_texts_to_documents").\
203
  then(docs_to_bge_embed_np_array, inputs=[semantic_input_document_format, in_semantic_file, output_file_state, in_clean_data, embeddings_state, embeddings_model_name_state, embeddings_model_loc_state, return_intermediate_files, embeddings_compress], outputs=[semantic_load_progress, embeddings_state, semantic_output_file, output_file_state, embeddings_model_state], api_name="embed_documents")
 
78
  current_source = gr.Textbox(label="Current data source(s)", value="None")
79
 
80
  with gr.Accordion(label = "Load in data", open=True):
81
+ in_bm25_file = gr.File(label="Upload data for keyword search", file_count= 'multiple', file_types =['.parquet', '.csv', '.pkl', '.pkl.gz', '.zip'])
82
  with gr.Row():
83
  in_bm25_column = gr.Dropdown(label="Enter the name of the text column in the data file to search")
84
  load_bm25_data_button = gr.Button(value="Load data")
 
107
  current_source_semantic = gr.Textbox(label="Current data source(s)", value="None")
108
 
109
  with gr.Accordion("Load in data", open = True):
110
+ in_semantic_file = gr.File(label="Upload data file for semantic search", file_count= 'multiple', file_types = ['.parquet', '.csv', '.npy', '.npz', '.pkl', '.pkl.gz', '.zip'])
111
 
112
  with gr.Row():
113
  in_semantic_column = gr.Dropdown(label="Enter the name of the text column in the data file to search")
 
115
 
116
  semantic_load_progress = gr.Textbox(label="Load progress")
117
 
118
+ with gr.Accordion(label="Semantic search options", open = False):
119
+ semantic_min_distance = gr.Slider(label = "Minimum distance score for search result to be included", value = 0.2, minimum=0, maximum=0.95, step=0.01)
120
+
121
  semantic_query = gr.Textbox(label="Enter semantic search query here")
122
  semantic_submit = gr.Button(value="Start semantic search", variant="primary")
123
 
 
149
  in_search_param_button = gr.Button(value="Load search parameters (Need to click this if you changed anything above)")
150
  with gr.Accordion(label="Fuzzy search options", open = False):
151
  no_spelling_mistakes = gr.Slider(label = "Number of spelling mistakes allowed in fuzzy search", value = 1, minimum=1, maximum=4, step=1)
152
+
 
153
  with gr.Accordion(label = "Join on additional dataframes to results", open = False):
154
  in_join_file = gr.File(label="Upload your data to join here")
155
  in_join_message = gr.Textbox(label="Join file load progress")
 
182
 
183
  ### BM25 SEARCH ###
184
  # Update dropdowns upon initial file load
185
+ in_bm25_file.upload(initial_data_load, inputs=[in_bm25_file], outputs=[in_bm25_column, search_df_join_column, prepared_keyword_data_state, orig_keyword_data_state, bm25_search_index_state, embeddings_state, tokenised_prepared_keyword_data_state, load_finished_message, current_source, in_bm25_file], api_name="keyword_data_load")
186
  in_join_file.change(put_columns_in_join_df, inputs=[in_join_file], outputs=[in_join_column, join_data_state, in_join_message])
187
 
188
  # Load in BM25 data
 
199
  ### SEMANTIC SEARCH ###
200
 
201
  # Load in a csv/excel file for semantic search
202
+ in_semantic_file.upload(initial_data_load, inputs=[in_semantic_file], outputs=[in_semantic_column, search_df_join_column, semantic_data_state, orig_semantic_data_state, bm25_search_index_state, embeddings_state, tokenised_prepared_keyword_data_state, semantic_load_progress, current_source_semantic, in_semantic_file], api_name="semantic_data_load")
203
+
204
  load_semantic_data_button.click(
205
  csv_excel_text_to_docs, inputs=[semantic_data_state, in_semantic_file, in_semantic_column, in_clean_data, return_intermediate_files], outputs=[semantic_input_document_format, semantic_load_progress, output_file_state], api_name="convert_texts_to_documents").\
206
  then(docs_to_bge_embed_np_array, inputs=[semantic_input_document_format, in_semantic_file, output_file_state, in_clean_data, embeddings_state, embeddings_model_name_state, embeddings_model_loc_state, return_intermediate_files, embeddings_compress], outputs=[semantic_load_progress, embeddings_state, semantic_output_file, output_file_state, embeddings_model_state], api_name="embed_documents")
search_funcs/bm25_functions.py CHANGED
@@ -685,10 +685,9 @@ def bm25_search(
685
 
686
  output_files.append(results_df_name)
687
 
688
- csv_output_file = output_folder + "keyword_search_result_" + today_rev + "_" + query_str_file + ".csv"
689
- results_df_out.to_csv(csv_output_file, index=None)
690
-
691
- output_files.append(csv_output_file)
692
 
693
  print("Returning results")
694
 
 
685
 
686
  output_files.append(results_df_name)
687
 
688
+ #csv_output_file = output_folder + "keyword_search_result_" + today_rev + "_" + query_str_file + ".csv"
689
+ #results_df_out.to_csv(csv_output_file, index=None)
690
+ #output_files.append(csv_output_file)
 
691
 
692
  print("Returning results")
693
 
search_funcs/helper_functions.py CHANGED
@@ -6,6 +6,7 @@ import os
6
  import shutil
7
  import getpass
8
  import gzip
 
9
  import pickle
10
  import numpy as np
11
 
@@ -177,7 +178,40 @@ def read_file(filename):
177
 
178
  return file
179
 
180
- def initial_data_load(in_file:List[str]):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
181
  '''
182
  When file is loaded, update the column dropdown choices and relevant state variables
183
  '''
@@ -192,10 +226,15 @@ def initial_data_load(in_file:List[str]):
192
 
193
  file_list = [string.name for string in in_file]
194
 
195
- #print(file_list)
 
 
 
 
 
196
 
197
  data_file_names = [string for string in file_list if "tokenised" not in string.lower() and "npz" not in string.lower() and "search_index" not in string.lower()]
198
- print(data_file_names)
199
 
200
  if not data_file_names:
201
  out_message = "Please load in at least one csv/Excel/parquet data file."
@@ -204,9 +243,10 @@ def initial_data_load(in_file:List[str]):
204
 
205
  # This if you have loaded in a documents object for the semantic search
206
  if "pkl" in data_file_names[0]:
 
207
  df = read_file(data_file_names[0])
208
  new_choices = list(df[0].metadata.keys()) #["Documents"] #["page_contents"] +
209
- current_source = get_file_path_end_with_ext(data_file_names[0])
210
 
211
  # This if you have loaded in a csv/parquets/xlsx
212
  else:
@@ -231,11 +271,14 @@ def initial_data_load(in_file:List[str]):
231
 
232
  concat_choices.extend(new_choices)
233
 
 
 
234
  # Check if there is a search index file already
235
- index_file_names = [string for string in file_list if "gz" in string.lower()]
236
 
237
  if index_file_names:
238
  index_file_name = index_file_names[0]
 
239
  index_load = read_file(index_file_name)
240
 
241
  embeddings_file_names = [string for string in file_list if "embedding" in string.lower()]
@@ -254,10 +297,10 @@ def initial_data_load(in_file:List[str]):
254
  if tokenised_file_names:
255
  tokenised_load = read_file(tokenised_file_names[0])
256
 
257
- out_message = "Initial data check successful. Next, choose a data column to search in the drop down above, then click 'Load data'"
258
  print(out_message)
259
 
260
- return gr.Dropdown(choices=concat_choices), gr.Dropdown(choices=concat_choices), df, df, index_load, embed_load, tokenised_load, out_message, current_source
261
 
262
  def put_columns_in_join_df(in_file:str):
263
  '''
 
6
  import shutil
7
  import getpass
8
  import gzip
9
+ import zipfile
10
  import pickle
11
  import numpy as np
12
 
 
178
 
179
  return file
180
 
181
+ def process_zip_files(file_list, progress=gr.Progress(track_tqdm=True)):
182
+ """
183
+ Processes a list of file names, unzipping any ZIP files found
184
+ and adding the extracted file names to the list.
185
+
186
+ Args:
187
+ file_list: A list of file names (strings).
188
+ """
189
+ progress(0.1, desc="Unzipping zip files")
190
+
191
+ i = 0
192
+ while i < len(file_list): # Use 'while' for dynamic list changes
193
+ file_path = file_list[i]
194
+
195
+ if file_path.endswith(".zip"):
196
+ try:
197
+ zip_dir = os.path.dirname(file_path) or "." # Get zip file's directory or use current if none
198
+ with zipfile.ZipFile(file_path, 'r') as zip_ref:
199
+ zip_ref.extractall(zip_dir) # Extract to zip's directory
200
+ #print("List of files in zip:", zip_ref.namelist())
201
+ extracted_files = [os.path.join(zip_dir, name) for name in zip_ref.namelist()]
202
+ file_list.extend(extracted_files)
203
+
204
+ except zipfile.BadZipFile:
205
+ print(f"Warning: '{file_path}' is not a valid zip file.")
206
+
207
+ i += 1
208
+
209
+ file_list = [file for file in file_list if not file.endswith(".zip")]
210
+ print("file_list after files in zip extracted:", file_list)
211
+
212
+ return file_list
213
+
214
+ def initial_data_load(in_file:List[str], progress = gr.Progress(track_tqdm=True)):
215
  '''
216
  When file is loaded, update the column dropdown choices and relevant state variables
217
  '''
 
226
 
227
  file_list = [string.name for string in in_file]
228
 
229
+ # If a zip file is loaded, unzip it and add the file names to the file_list
230
+ file_list = process_zip_files(file_list)
231
+
232
+ #print("File_list that makes it to main data load function:", file_list)
233
+
234
+ progress(0.3, desc="Loading in data files")
235
 
236
  data_file_names = [string for string in file_list if "tokenised" not in string.lower() and "npz" not in string.lower() and "search_index" not in string.lower()]
237
+ print("Data file names:", data_file_names)
238
 
239
  if not data_file_names:
240
  out_message = "Please load in at least one csv/Excel/parquet data file."
 
243
 
244
  # This if you have loaded in a documents object for the semantic search
245
  if "pkl" in data_file_names[0]:
246
+ print("Document object for semantic search:", data_file_names[0])
247
  df = read_file(data_file_names[0])
248
  new_choices = list(df[0].metadata.keys()) #["Documents"] #["page_contents"] +
249
+ current_source = get_file_path_end_with_ext(data_file_names[0])
250
 
251
  # This if you have loaded in a csv/parquets/xlsx
252
  else:
 
271
 
272
  concat_choices.extend(new_choices)
273
 
274
+ progress(0.6, desc="Loading in embedding/search index files")
275
+
276
  # Check if there is a search index file already
277
+ index_file_names = [string for string in file_list if ".gz" in string.lower()]
278
 
279
  if index_file_names:
280
  index_file_name = index_file_names[0]
281
+ print("Search index file name found:", index_file_name)
282
  index_load = read_file(index_file_name)
283
 
284
  embeddings_file_names = [string for string in file_list if "embedding" in string.lower()]
 
297
  if tokenised_file_names:
298
  tokenised_load = read_file(tokenised_file_names[0])
299
 
300
+ out_message = "Initial data load successful. Next, choose a data column to search in the drop down above, then click 'Load data'"
301
  print(out_message)
302
 
303
+ return gr.Dropdown(choices=concat_choices), gr.Dropdown(choices=concat_choices), df, df, index_load, embed_load, tokenised_load, out_message, current_source, file_list
304
 
305
  def put_columns_in_join_df(in_file:str):
306
  '''
search_funcs/semantic_functions.py CHANGED
@@ -1,4 +1,3 @@
1
- import os
2
  import time
3
  import pandas as pd
4
  from typing import Type
@@ -116,20 +115,15 @@ def docs_to_bge_embed_np_array(
116
 
117
  if "bge" in embeddings_model_name:
118
  print("Embedding with BGE model")
119
- if embeddings_compress == "No":
120
- print("Embedding with full fp32 precision")
121
- embeddings_out = embeddings_model.encode(sentences=page_contents, show_progress_bar = True, batch_size = batch_size, normalize_embeddings=True)
122
- else:
123
- print("Embedding with int8 precision")
124
- embeddings_out = embeddings_model.encode(sentences=page_contents, show_progress_bar = True, batch_size = batch_size, normalize_embeddings=True, precision="int8")
125
  else:
126
  print("Embedding with MiniLM-L6-v2 model")
127
- if embeddings_compress == "No":
128
- print("Embedding with full fp32 precision")
129
- embeddings_out = embeddings_model.encode(sentences=page_contents, show_progress_bar = True, batch_size = batch_size)
130
- else:
131
- print("Embedding with int8 precision")
132
- embeddings_out = embeddings_model.encode(sentences=page_contents, show_progress_bar = True, batch_size = batch_size, precision="int8")
 
133
 
134
  toc = time.perf_counter()
135
  time_out = f"The embedding took {toc - tic:0.1f} seconds"
@@ -288,60 +282,43 @@ def bge_semantic_search(
288
 
289
  # Encode the query using the sentence transformer and convert to a PyTorch tensor
290
  if "bge" in embeddings_model_name:
291
- if embeddings_compress == "Yes":
292
- query_fp32 = embeddings_model.encode(query_str, normalize_embeddings=True)
293
-
294
- #query = query_fp32
295
- query = quantize_embeddings(
296
- query_fp32,
297
- precision="int8",
298
- calibration_embeddings=embeddings)
299
-
300
- else:
301
- query = embeddings_model.encode(query_str, normalize_embeddings=True)
302
 
303
- # Get cosine similarities
304
- cosine_similarities = query @ embeddings.T
305
 
306
- # Sentence transformers method, not used:
307
- #cosine_similarities = query @ embeddings.T
308
 
309
- #cosine_similarities = embeddings_model.similarity(query, embeddings)
310
- # Flatten the tensor to a 1D array
311
- #cosine_similarities = cosine_similarities.flatten()
 
 
312
  else:
313
- print("Comparing similarity using Minilm-L6-v2")
 
 
 
314
 
315
- if embeddings_compress == "Yes":
316
- query_fp32 = embeddings_model.encode(query_str, normalize_embeddings=True)
317
-
318
- #query = query_fp32
319
- query = quantize_embeddings(
320
- query_fp32,
321
- precision="int8",
322
- calibration_embeddings=embeddings)
323
- else:
324
- query = embeddings_model.encode(query_str, normalize_embeddings=True)
325
 
326
- #cosine_similarities = embeddings_model.cosine_similarity(query, embeddings)
327
 
328
- print("query:", query_fp32)
329
- print("embeddings:", embeddings)
330
 
331
- embeddings_norm = np.linalg.norm(embeddings, axis=1)
332
 
333
- embeddings_norm = np.linalg.norm(embeddings, axis=1, keepdims=True) # Keep dims to allow broadcasting
334
- normalized_embeddings = embeddings / embeddings_norm
335
 
336
- print("normalized_embeddings:", normalized_embeddings)
 
337
 
338
- expanded_query_fp32 = np.expand_dims(query_fp32, axis=0)
339
- cosine_similarities = (expanded_query_fp32 @ normalized_embeddings.T)
340
 
341
- print("Initial cosine similarities:", cosine_similarities)
342
-
343
- # Flatten the tensor to a 1D array
344
- cosine_similarities = cosine_similarities.flatten()
345
 
346
  # Create a Pandas Series
347
  cosine_similarities_series = pd.Series(cosine_similarities)
@@ -379,14 +356,12 @@ def bge_semantic_search(
379
 
380
  #results_df_out.to_excel(results_df_name, index= None)
381
  results_first_text = results_df_out.iloc[0, 1]
382
-
383
  output_files.append(results_df_name)
384
 
385
- csv_output_file = output_folder + "semantic_search_result_" + today_rev + "_" + query_str_file + ".csv"
386
- results_df_out.to_csv(csv_output_file, index=None)
387
-
388
- output_files.append(csv_output_file)
389
 
390
  print("Returning results")
391
 
392
- return results_first_text, results_df_name
 
 
1
  import time
2
  import pandas as pd
3
  from typing import Type
 
115
 
116
  if "bge" in embeddings_model_name:
117
  print("Embedding with BGE model")
 
 
 
 
 
 
118
  else:
119
  print("Embedding with MiniLM-L6-v2 model")
120
+
121
+ if embeddings_compress == "No":
122
+ print("Embedding with full fp32 precision")
123
+ embeddings_out = embeddings_model.encode(sentences=page_contents, show_progress_bar = True, batch_size = batch_size)
124
+ else:
125
+ print("Embedding with int8 precision")
126
+ embeddings_out = embeddings_model.encode(sentences=page_contents, show_progress_bar = True, batch_size = batch_size, precision="int8")
127
 
128
  toc = time.perf_counter()
129
  time_out = f"The embedding took {toc - tic:0.1f} seconds"
 
282
 
283
  # Encode the query using the sentence transformer and convert to a PyTorch tensor
284
  if "bge" in embeddings_model_name:
285
+ print("Comparing similarity using BGE model")
286
+ else:
287
+ print("Comparing similarity using MiniLM-L6-v2 model")
 
 
 
 
 
 
 
 
288
 
 
 
289
 
290
+ if embeddings_compress == "Yes":
291
+ query_fp32 = embeddings_model.encode(query_str)
292
 
293
+ # Using a query as int8 doesn't actually seem to work
294
+ # query_int8 = quantize_embeddings(
295
+ # query_fp32, precision="int8", calibration_embeddings=embeddings
296
+ # )
297
+
298
  else:
299
+ query_fp32 = embeddings_model.encode(query_str)
300
+
301
+ #print("query:", query_fp32)
302
+ #print("embeddings:", embeddings)
303
 
304
+ # Normalise embeddings
 
 
 
 
 
 
 
 
 
305
 
306
+ query = query_fp32.astype('float32')
307
 
308
+ query_norm = np.linalg.norm(query)
309
+ normalized_query = query / query_norm
310
 
311
+ embeddings = embeddings.astype('float32')
312
 
313
+ embeddings_norm = np.linalg.norm(embeddings, axis=1, keepdims=True) # Keep dims to allow broadcasting
314
+ normalized_embeddings = embeddings / embeddings_norm
315
 
316
+ #print("normalized_query:", normalized_query)
317
+ #print("normalized_embeddings:", normalized_embeddings)
318
 
319
+ cosine_similarities = (normalized_query @ normalized_embeddings.T)
 
320
 
321
+ #print("Initial cosine similarities:", cosine_similarities)
 
 
 
322
 
323
  # Create a Pandas Series
324
  cosine_similarities_series = pd.Series(cosine_similarities)
 
356
 
357
  #results_df_out.to_excel(results_df_name, index= None)
358
  results_first_text = results_df_out.iloc[0, 1]
 
359
  output_files.append(results_df_name)
360
 
361
+ #csv_output_file = output_folder + "semantic_search_result_" + today_rev + "_" + query_str_file + ".csv"
362
+ #results_df_out.to_csv(csv_output_file, index=None)
363
+ #output_files.append(csv_output_file)
 
364
 
365
  print("Returning results")
366
 
367
+ return results_first_text, output_files
search_funcs/spacy_search_funcs.py CHANGED
@@ -1,7 +1,3 @@
1
- import spacy
2
- spacy.prefer_gpu()
3
- from spacy.cli.download import download
4
- from spacy.matcher import Matcher
5
  import numpy as np
6
  import gradio as gr
7
  import pandas as pd
@@ -13,11 +9,13 @@ PandasDataFrame = Type[pd.DataFrame]
13
 
14
  today_rev = datetime.now().strftime("%Y%m%d")
15
 
16
-
17
-
18
  def spacy_fuzzy_search(string_query:str, tokenised_data: List[List[str]], original_data: PandasDataFrame, text_column:str, in_join_file: PandasDataFrame, search_df_join_column:str, in_join_column:str, no_spelling_mistakes:int = 1, progress=gr.Progress(track_tqdm=True)):
19
  ''' Conduct fuzzy match on a list of data.'''
20
 
 
 
 
 
21
  # Load spaCy model
22
  nlp = load_spacy_model()
23
 
 
 
 
 
 
1
  import numpy as np
2
  import gradio as gr
3
  import pandas as pd
 
9
 
10
  today_rev = datetime.now().strftime("%Y%m%d")
11
 
 
 
12
  def spacy_fuzzy_search(string_query:str, tokenised_data: List[List[str]], original_data: PandasDataFrame, text_column:str, in_join_file: PandasDataFrame, search_df_join_column:str, in_join_column:str, no_spelling_mistakes:int = 1, progress=gr.Progress(track_tqdm=True)):
13
  ''' Conduct fuzzy match on a list of data.'''
14
 
15
+ import spacy
16
+ spacy.prefer_gpu()
17
+ from spacy.matcher import Matcher
18
+
19
  # Load spaCy model
20
  nlp = load_spacy_model()
21