Sean-Case committed
Commit 3df8e40
1 Parent(s): 200480d

Fixed data input for semantic search and allowed docs to be loaded in directly. 0.2.1

README.md CHANGED
@@ -15,7 +15,7 @@ Search through long-form text fields in your tabular data. Either for exact, spe
 # Guide
 ## Keyword search
 
-1. Load in your tabular data file (.csv, .parquet, .xlsx - first sheet). If the 'Keyword search' folder has been prepared, select both of the .parquet files in this folder (both the file with and without 'tokenised' in the name) to load into the app.
+1. Load in your tabular data file (.csv, .parquet, .xlsx - first sheet). If the 'Keyword search' folder has been prepared, select both of the files in this folder (both the data file and the file ending 'search_index.pkl.gz') to load into the app.
 2. Wait for the file(s) to upload, then in the dropdown menu below 'Enter the name of the text column...' choose the column from the data file that you want to search.
 3. Hit 'Load data'. The 'Load progress' text box will let you know when the file is ready.
 4. In the 'Enter your search term' area below this, type in the key words you want to find in your text. Note that if the term is not spelled exactly as it is found in the text, it will not be found!
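
For context, a minimal sketch of what loading the prepared file pair looks like in Python (file names here are illustrative, and unpickling the index requires the app's BM25 class to be importable):

```python
import gzip
import pickle

import pandas as pd

# The tabular data file (hypothetical name).
data_df = pd.read_parquet("reviews_cleaned.parquet")

# The prepared BM25 index, saved by the app as '..._search_index.pkl.gz'.
with gzip.open("reviews_cleaned_search_index.pkl.gz", "rb") as f:
    bm25_index = pickle.load(f)
```
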
app.py CHANGED
@@ -28,12 +28,13 @@ with block:
     out_passages = gr.State(9999)
     vec_weight = gr.State(1)
 
-    docs_keep_as_doc_state = gr.State()
-    doc_df_state = gr.State()
-    docs_keep_out_state = gr.State()
+    #docs_keep_as_doc_state = gr.State()
+    #doc_df_state = gr.State()
+    #docs_keep_out_state = gr.State()
 
     corpus_state = gr.State()
-    data_state = gr.State(pd.DataFrame())
+    keyword_data_state = gr.State(pd.DataFrame())
+    semantic_data_state = gr.State(pd.DataFrame())
 
     in_k1_info = gr.State("""k1: Constant used for influencing the term frequency saturation. After saturation is reached, additional
     presence for the term adds a significantly less additional score. According to [1]_, experiments suggest
@@ -58,13 +59,13 @@ depends on factors such as the type of documents or queries. Information taken f
     """
     **Exact term keyword search**
 
-    1. Load in data file (ideally a file with '_cleaned' at the end of the name), with (optionally) the '...tokenised.parquet' in the same folder to save loading time. 2. Select the field in your data to search. A field with the suffix '_cleaned' means that html tags have been removed. 3. Wait for the data file to be prepared for search. 4. Enter the search term in the relevant box below and press Enter/click on 'Search text'. 5. Your search results will be saved in a csv file and will be presented in the 'File output' area below.
+    1. Load in data file (ideally a file with '_cleaned' at the end of the name), with (optionally) the '...search_index.pkl.gz' in the same folder to save loading time. 2. Select the field in your data to search. A field with the suffix '_cleaned' means that html tags have been removed. 3. Wait for the data file to be prepared for search. 4. Enter the search term in the relevant box below and press Enter/click on 'Search text'. 5. Your search results will be saved in a csv file and will be presented in the 'File output' area below.
     """)
     with gr.Row():
         current_source = gr.Textbox(label="Current data source(s)", value="None")
 
     with gr.Accordion(label = "Load in data", open=True):
-        in_bm25_file = gr.File(label="Upload data for keyword search", file_count= 'multiple', file_types = ['.parquet', '.csv'])
+        in_bm25_file = gr.File(label="Upload data for keyword search", file_count= 'multiple', file_types = ['.parquet', '.csv', '.pkl', '.pkl.gz'])
         with gr.Row():
             in_bm25_column = gr.Dropdown(label="Enter the name of the text column in the data file to search")
             load_bm25_data_button = gr.Button(value="Load data")
@@ -148,22 +149,22 @@ depends on factors such as the type of documents or queries. Information taken f
 
     ### BM25 SEARCH ###
     # Update dropdowns upon initial file load
-    in_bm25_file.upload(put_columns_in_df, inputs=[in_bm25_file, in_bm25_column], outputs=[in_bm25_column, in_clean_data, search_df_join_column, data_state])
+    in_bm25_file.upload(put_columns_in_df, inputs=[in_bm25_file, in_bm25_column], outputs=[in_bm25_column, in_clean_data, search_df_join_column, keyword_data_state])
     in_join_file.upload(put_columns_in_join_df, inputs=[in_join_file, in_join_column], outputs=[in_join_column])
 
     # Load in BM25 data
-    load_bm25_data_button.click(fn=prepare_bm25_input_data, inputs=[in_bm25_file, in_bm25_column, data_state, in_clean_data, return_intermediate_files], outputs=[corpus_state, load_finished_message, data_state, output_file, output_file, current_source]).\
-    then(fn=prepare_bm25, inputs=[corpus_state, in_k1, in_b, in_alpha], outputs=[load_finished_message])#.\
+    load_bm25_data_button.click(fn=prepare_bm25_input_data, inputs=[in_bm25_file, in_bm25_column, keyword_data_state, in_clean_data, return_intermediate_files], outputs=[corpus_state, load_finished_message, keyword_data_state, output_file, output_file, current_source]).\
+    then(fn=prepare_bm25, inputs=[corpus_state, in_bm25_file, return_intermediate_files, in_k1, in_b, in_alpha], outputs=[load_finished_message, output_file])#.\
     #then(fn=put_columns_in_df, inputs=[in_bm25_file, in_bm25_column], outputs=[in_bm25_column, in_clean_data, search_df_join_column])
 
     # BM25 search functions on click or enter
-    keyword_search_button.click(fn=bm25_search, inputs=[keyword_query, in_no_search_results, data_state, in_bm25_column, in_clean_data, in_join_file, in_join_column, search_df_join_column], outputs=[output_single_text, output_file], api_name="keyword")
-    keyword_query.submit(fn=bm25_search, inputs=[keyword_query, in_no_search_results, data_state, in_bm25_column, in_clean_data, in_join_file, in_join_column, search_df_join_column], outputs=[output_single_text, output_file])
+    keyword_search_button.click(fn=bm25_search, inputs=[keyword_query, in_no_search_results, keyword_data_state, in_bm25_column, in_clean_data, in_join_file, in_join_column, search_df_join_column], outputs=[output_single_text, output_file], api_name="keyword")
+    keyword_query.submit(fn=bm25_search, inputs=[keyword_query, in_no_search_results, keyword_data_state, in_bm25_column, in_clean_data, in_join_file, in_join_column, search_df_join_column], outputs=[output_single_text, output_file])
 
     ### SEMANTIC SEARCH ###
     # Load in a csv/excel file for semantic search
-    in_semantic_file.upload(put_columns_in_df, inputs=[in_semantic_file, in_semantic_column], outputs=[in_semantic_column, in_clean_data, search_df_join_column, data_state])
-    load_semantic_data_button.click(parse_csv_or_excel, inputs=[in_semantic_file, data_state, in_semantic_column], outputs=[ingest_text, current_source_semantic, semantic_load_progress]).\
+    in_semantic_file.upload(put_columns_in_df, inputs=[in_semantic_file, in_semantic_column], outputs=[in_semantic_column, in_clean_data, search_df_join_column, semantic_data_state])
+    load_semantic_data_button.click(parse_csv_or_excel, inputs=[in_semantic_file, semantic_data_state, in_semantic_column], outputs=[ingest_text, current_source_semantic, semantic_load_progress]).\
     then(csv_excel_text_to_docs, inputs=[ingest_text, in_semantic_file, in_semantic_column, in_clean_data, return_intermediate_files], outputs=[ingest_docs, semantic_load_progress]).\
    then(docs_to_jina_embed_np_array, inputs=[ingest_docs, in_semantic_file, return_intermediate_files, embedding_super_compress], outputs=[semantic_load_progress, vectorstore_state, semantic_output_file])
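
The wiring above relies on per-mode gr.State holders and chained events. A minimal self-contained sketch of the same pattern (component names and functions here are illustrative, not the app's, and file.name assumes Gradio 3-style file objects):

```python
import gradio as gr
import pandas as pd

def load_data(file, df_state):
    # Illustrative loader; in the real app this is prepare_bm25_input_data.
    df = pd.read_csv(file.name)
    return df, "Data loaded."

def build_index(df_state):
    # Illustrative second step; in the real app this is prepare_bm25.
    return f"Index built over {len(df_state)} rows."

with gr.Blocks() as demo:
    keyword_data_state = gr.State(pd.DataFrame())   # one state per search mode,
    semantic_data_state = gr.State(pd.DataFrame())  # so the two tabs no longer share data

    in_file = gr.File(label="Upload data")
    load_btn = gr.Button("Load data")
    progress = gr.Textbox(label="Load progress")

    # .click() returns an event that .then() chains onto, mirroring app.py.
    load_btn.click(load_data, inputs=[in_file, keyword_data_state],
                   outputs=[keyword_data_state, progress]).\
        then(build_index, inputs=[keyword_data_state], outputs=[progress])

demo.launch()
```
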
how_to_create_exe_dist.txt CHANGED
@@ -17,10 +17,10 @@ NOTE: for ensuring that spaCy models are loaded into the program correctly in re
 8. In command line, cd to the folder that contains app.py. Then run the following:
 
 For one single file:
-python -m PyInstaller --additional-hooks-dir="build_deps\\" --hidden-import pyarrow.vendored.version --add-data="build_deps\\types.json;gradio_client" --add-data "model;model" --onefile --clean --noconfirm --name DataSearchApp_0.1.1 app.py
+python -m PyInstaller --additional-hooks-dir="build_deps\\" --hidden-import pyarrow.vendored.version --add-data="build_deps\\types.json;gradio_client" --add-data "model;model" --onefile --clean --noconfirm --name DataSearchApp_0.2.1 app.py
 
 For a small exe with a folder of dependencies:
-python -m PyInstaller --additional-hooks-dir="build_deps\\" --hidden-import pyarrow.vendored.version --add-data="build_deps\\types.json;gradio_client" --add-data "model;model" --clean --noconfirm --name DataSearchApp_0.1.1 app.py
+python -m PyInstaller --additional-hooks-dir="build_deps\\" --hidden-import pyarrow.vendored.version --add-data="build_deps\\types.json;gradio_client" --add-data "model;model" --clean --noconfirm --name DataSearchApp_0.2.1 app.py
 
 9. A 'dist' folder will be created with the executable inside along with all dependencies('dist\data_text_search').
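If the build needs to run from a script rather than the shell, PyInstaller also exposes a programmatic entry point; a minimal sketch mirroring the single-file command above (flags copied from this file, nothing added):

```python
import PyInstaller.__main__

# Equivalent to the 'one single file' command above.
PyInstaller.__main__.run([
    "--additional-hooks-dir=build_deps\\",
    "--hidden-import", "pyarrow.vendored.version",
    "--add-data=build_deps\\types.json;gradio_client",
    "--add-data", "model;model",
    "--onefile",
    "--clean",
    "--noconfirm",
    "--name", "DataSearchApp_0.2.1",
    "app.py",
])
```
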
search_funcs/bm25_functions.py CHANGED
@@ -3,8 +3,10 @@ import heapq
 import math
 import pickle
 import sys
+import gzip
 import time
 import pandas as pd
+import numpy as np
 from numpy import inf
 import gradio as gr
 
@@ -235,7 +237,7 @@ def prepare_bm25_input_data(in_file, text_column, data_state, clean="No", retur
 
     #print(file_list)
 
-    data_file_names = [string.lower() for string in file_list if "tokenised" not in string and "npz" not in string.lower()]
+    data_file_names = [string.lower() for string in file_list if "tokenised" not in string and "npz" not in string.lower() and "gz" not in string.lower()]
 
     data_file_name = data_file_names[0]
 
@@ -247,20 +249,24 @@
     tokenised_df = pd.DataFrame()
 
     tokenised_file_names = [string.lower() for string in file_list if "tokenised" in string.lower()]
+    search_index_file_names = [string.lower() for string in file_list if "gz" in string.lower()]
+
+    df[text_column] = df[text_column].astype(str).str.lower()
+
+    if search_index_file_names:
+        corpus = list(df[text_column])
+        message = "Tokenisation skipped - loading search index from file."
+        print(message)
+        return corpus, message, df, None, None, None
 
     if tokenised_file_names:
         tokenised_df = read_file(tokenised_file_names[0])
-        #print("Tokenised df is: ", tokenised_df.head())
-
-    #df = pd.read_parquet(file_in.name)
-
-    df[text_column] = df[text_column].astype(str).str.lower()
 
     if clean == "Yes":
         clean_tic = time.perf_counter()
         print("Starting data clean.")
 
-        df = df.drop_duplicates(text_column)
+        #df = df.drop_duplicates(text_column)
         df_list = list(df[text_column])
         df_list = initial_clean(df_list)
 
@@ -336,20 +342,62 @@ def save_prepared_bm25_data(in_file_name, prepared_text_list, in_df, in_bm25_col
 
     return file_name, new_text_column
 
-def prepare_bm25(corpus, k1=1.5, b = 0.75, alpha=-5):
+def prepare_bm25(corpus, in_file, return_intermediate_files, k1=1.5, b = 0.75, alpha=-5):
     #bm25.save("saved_df_bm25")
     #bm25 = BM25.load(re.sub(r'\.pkl$', '', file_in.name))
 
-    print("Preparing BM25 corpus")
+    file_list = [string.name for string in in_file]
+
+    #print(file_list)
+
+    # Get data file name
+    data_file_names = [string.lower() for string in file_list if "tokenised" not in string and "npz" not in string.lower() and "gz" not in string.lower()]
+
+    data_file_name = data_file_names[0]
+    data_file_out_name = get_file_path_end_with_ext(data_file_name)
+    data_file_name_no_ext = get_file_path_end(data_file_name)
+
+    # Check if there is a search index file already
+    index_file_names = [string.lower() for string in file_list if "gz" in string.lower()]
+
+    if index_file_names:
+        index_file_name = index_file_names[0]
+
+        print(index_file_name)
+
+        bm25_load = read_file(index_file_name)
+
+        #index_file_out_name = get_file_path_end_with_ext(index_file_name)
+        #index_file_name_no_ext = get_file_path_end(index_file_name)
+
+    else:
+        print("Preparing BM25 corpus")
+
+        bm25_load = BM25(corpus, k1=k1, b=b, alpha=alpha)
 
     global bm25
-    bm25 = BM25(corpus, k1=k1, b=b, alpha=alpha)
+    bm25 = bm25_load
+
+    if return_intermediate_files == "Yes":
+        bm25_search_file_name = data_file_name_no_ext + '_' + 'search_index.pkl.gz'
+        #np.savez_compressed(bm25_search_file_name, bm25)
+
+        with gzip.open(bm25_search_file_name, 'wb') as file:
+            pickle.dump(bm25, file)
+
+        print("Search index saved to file")
+
+        message = "Search parameters loaded."
+
+        return message, bm25_search_file_name
 
     message = "Search parameters loaded."
 
     print(message)
 
-    return message
+    return message, None
 
 def convert_bm25_query_to_tokens(free_text_query, clean="No"):
     '''
@@ -418,8 +466,8 @@ def bm25_search(free_text_query, in_no_search_results, original_data, text_colum
 
     # Out file
    query_str_file = ("_").join(token_query)
-    results_df_name = "keyword_search_result_" + today_rev + "_" + query_str_file + ".csv"
-    results_df_out.to_csv(results_df_name, index= None)
+    results_df_name = "keyword_search_result_" + today_rev + "_" + query_str_file + ".xlsx"
+    results_df_out.to_excel(results_df_name, index= None)
     results_first_text = results_df_out[text_column].iloc[0]
 
     print("Returning results")
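
The heart of the new index caching in prepare_bm25 is a plain gzip + pickle round trip. A minimal standalone sketch (BM25Stub and the file name are illustrative stand-ins, not the app's BM25 class):

```python
import gzip
import pickle

class BM25Stub:
    """Stand-in for the app's BM25 object; anything picklable works."""
    def __init__(self, corpus):
        self.corpus = corpus

index = BM25Stub(corpus=[["hello", "world"], ["search", "me"]])

# Save, as prepare_bm25 does when return_intermediate_files == "Yes".
with gzip.open("example_search_index.pkl.gz", "wb") as f:
    pickle.dump(index, f)

# Load, as read_file does for 'pkl.gz' files on a later run.
with gzip.open("example_search_index.pkl.gz", "rb") as f:
    restored = pickle.load(f)

assert restored.corpus == index.corpus
```
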
search_funcs/helper_functions.py CHANGED
@@ -2,10 +2,6 @@ import os
 import re
 import pandas as pd
 import gradio as gr
-
-import os
-import shutil
-
 import os
 import shutil
 import getpass
@@ -35,7 +31,6 @@ def empty_folder(directory_path):
 
 
 
-
 def get_file_path_end(file_path):
     # First, get the basename of the file (e.g., "example.txt" from "/path/to/example.txt")
     basename = os.path.basename(file_path)
@@ -64,6 +59,8 @@ def detect_file_type(filename):
         return 'parquet'
     elif filename.endswith('.pkl.gz'):
         return 'pkl.gz'
+    #elif filename.endswith('.gz'):
+    #    return 'gz'
     else:
         raise ValueError("Unsupported file type.")
 
@@ -82,7 +79,9 @@ def read_file(filename):
     elif file_type == 'pkl.gz':
         with gzip.open(filename, 'rb') as file:
             file = pickle.load(file)
-    #file = pd.read_pickle(filename)
+    #elif file_type == ".gz":
+    #    with gzip.open(filename, 'rb') as file:
+    #        file = pickle.load(file)
 
     print("File load complete")
 
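For reference, the detect_file_type/read_file pair implements a simple suffix dispatch; a condensed sketch of that pattern, keeping only the branches relevant to this commit:

```python
import gzip
import pickle

import pandas as pd

def read_any(filename: str):
    # Condensed suffix dispatch; check the compound '.pkl.gz' suffix first.
    if filename.endswith(".pkl.gz"):
        with gzip.open(filename, "rb") as f:
            return pickle.load(f)  # e.g. a saved BM25 search index
    elif filename.endswith(".parquet"):
        return pd.read_parquet(filename)
    elif filename.endswith(".csv"):
        return pd.read_csv(filename)
    else:
        raise ValueError("Unsupported file type.")
```
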
search_funcs/semantic_functions.py CHANGED
@@ -96,10 +96,10 @@ def docs_to_jina_embed_np_array(docs_out, in_file, return_intermediate_files = "
     ## Load in pre-embedded file if exists
     file_list = [string.name for string in in_file]
 
-    print(file_list)
+    #print(file_list)
 
-    embeddings_file_names = [string.lower() for string in file_list if "npz" in string.lower()]
-    data_file_names = [string.lower() for string in file_list if "tokenised" not in string and "npz" not in string.lower()]
+    embeddings_file_names = [string.lower() for string in file_list if "embedding" in string.lower()]
+    data_file_names = [string.lower() for string in file_list if "tokenised" not in string.lower() and "npz" not in string.lower()]# and "gz" not in string.lower()]
     data_file_name = data_file_names[0]
     data_file_name_no_ext = get_file_path_end(data_file_name)
 
@@ -283,8 +283,8 @@ def jina_simple_retrieval(query_str:str, vectorstore, docs, orig_df_col:str, k_v
 
     query_str_file = query_str.replace(" ", "_")
 
-    results_df_name = "semantic_search_result_" + today_rev + "_" + query_str_file + ".csv"
-    results_df_out.to_csv(results_df_name, index= None)
+    results_df_name = "semantic_search_result_" + today_rev + "_" + query_str_file + ".xlsx"
+    results_df_out.to_excel(results_df_name, index= None)
     results_first_text = results_df_out.iloc[0, 1]
 
     return results_first_text, results_df_name
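
Both search paths now export .xlsx rather than .csv; a small sketch of that export step (pandas defers to an Excel engine such as openpyxl, which must be installed):

```python
import pandas as pd

# Illustrative results frame; the columns are stand-ins for the app's output.
results_df_out = pd.DataFrame({
    "search_text": ["first matching passage", "second matching passage"],
    "search_score": [12.3, 9.8],
})

# Mirrors the renamed output files in this commit; index=False drops the row index.
results_df_out.to_excel("semantic_search_result_example.xlsx", index=False)
```
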
search_funcs/semantic_ingest_functions.py CHANGED
@@ -130,7 +130,7 @@ def parse_csv_or_excel(file_path, data_state, text_column = "text"):
 
     #print(file_list)
 
-    data_file_names = [string.lower() for string in file_list if "tokenised" not in string and "npz" not in string.lower()]
+    data_file_names = [string.lower() for string in file_list if "tokenised" not in string.lower() and "npz" not in string.lower()]# and "gz" not in string.lower()]
 
     data_file_name = data_file_names[0]
 
@@ -329,7 +329,7 @@ def csv_excel_text_to_docs(df, in_file, text_column='text', clean = "No", return
     clean_tic = time.perf_counter()
     print("Starting data clean.")
 
-    df = df.drop_duplicates(text_column)
+    #df = df.drop_duplicates(text_column)
 
     df[text_column] = initial_clean(df[text_column])
     df_list = list(df[text_column])
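
The same substring filter for picking the raw data file out of the upload list recurs in prepare_bm25_input_data, docs_to_jina_embed_np_array and parse_csv_or_excel; a standalone sketch of the pattern (file names illustrative):

```python
file_list = [
    "reviews_cleaned.parquet",
    "reviews_cleaned_tokenised.parquet",
    "reviews_cleaned_search_index.pkl.gz",
]

# Keep only the raw data file by excluding known artefact name fragments,
# as the updated filters in this commit do.
data_file_names = [
    name.lower() for name in file_list
    if "tokenised" not in name.lower()
    and "npz" not in name.lower()
    and "gz" not in name.lower()
]

data_file_name = data_file_names[0]
print(data_file_name)  # reviews_cleaned.parquet
```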