seanpedrickcase committed
Commit: 650da6e
Parent: 58d3f97

Improvements with embeddings load and file save

app.py CHANGED
@@ -7,7 +7,7 @@ PandasDataFrame = Type[pd.DataFrame]
 
 from search_funcs.bm25_functions import prepare_bm25_input_data, prepare_bm25, bm25_search
 from search_funcs.semantic_ingest_functions import csv_excel_text_to_docs
-from search_funcs.semantic_functions import load_embedding_model, docs_to_bge_embed_np_array, bge_semantic_search
+from search_funcs.semantic_functions import load_embedding_model, docs_to_embed_np_array, bge_semantic_search
 from search_funcs.helper_functions import display_info, initial_data_load, put_columns_in_join_df, get_connection_params, output_folder, get_or_create_env_var # Not currently used: get_temp_folder_path, empty_folder,
 from search_funcs.spacy_search_funcs import spacy_fuzzy_search
 from search_funcs.aws_functions import load_data_from_aws
@@ -99,7 +99,7 @@ depends on factors such as the type of documents or queries. Information taken f
 """
 **Thematic/semantic search**
 
-This search type enables you to search for general terms (e.g. happiness, nature) and the search will pick out text passages that are most semantically similar to them. 1. Load in data file (ideally a file with '_cleaned' at the end of the name, a pkl.gz file), with (optionally) the 'embeddings... .npz' file in the same folder to save loading time. 2. Select the field in your data to search. 3. Wait for the data file to be prepared for search. 4. Enter the search term in the 'Enter semantic search query here' box below and press Enter/click on 'Start semantic search'. 4. Your search results will be saved in a csv file and will be presented in the 'File output' area below.
+This search type enables you to search for general terms (e.g. happiness, nature) and the search will pick out text passages that are most semantically similar to them. 1. Load in data file (ideally a file with '_cleaned' at the end of the name, a pkl.gz file), with (optionally) the 'embed... .npz' file in the same folder to save loading time. 2. Select the field in your data to search. 3. Wait for the data file to be prepared for search. 4. Enter the search term in the 'Enter semantic search query here' box below and press Enter/click on 'Start semantic search'. 4. Your search results will be saved in a csv file and will be presented in the 'File output' area below.
 """)
 
 with gr.Row():
@@ -202,7 +202,7 @@ depends on factors such as the type of documents or queries. Information taken f
 
 load_semantic_data_button.click(
     csv_excel_text_to_docs, inputs=[semantic_data_state, in_semantic_file, in_semantic_column, in_clean_data, return_intermediate_files], outputs=[semantic_input_document_format, semantic_load_progress, output_file_state], api_name="convert_texts_to_documents").\
-    then(docs_to_bge_embed_np_array, inputs=[semantic_input_document_format, in_semantic_file, output_file_state, in_clean_data, embeddings_state, embeddings_model_name_state, embeddings_model_loc_state, return_intermediate_files, embeddings_compress], outputs=[semantic_load_progress, embeddings_state, semantic_output_file, output_file_state, embeddings_model_state], api_name="embed_documents")
+    then(docs_to_embed_np_array, inputs=[semantic_input_document_format, in_semantic_file, output_file_state, in_clean_data, embeddings_state, embeddings_model_name_state, embeddings_model_loc_state, return_intermediate_files, embeddings_compress], outputs=[semantic_load_progress, embeddings_state, semantic_output_file, output_file_state, embeddings_model_state], api_name="embed_documents")
 
 # Semantic search query
 semantic_submit.click(bge_semantic_search, inputs=[semantic_query, embeddings_state, semantic_input_document_format, semantic_k_val, semantic_min_distance, embeddings_model_state, embeddings_model_name_state, embeddings_compress, join_data_state, in_join_column, search_df_join_column], outputs=[semantic_output_single_text, semantic_output_file], api_name="semantic_search")
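Note: the .click(...).then(...) wiring above is Gradio's standard event-chaining pattern: each then step runs only after the previous handler finishes, and gr.State components carry intermediate results between steps. A minimal, self-contained sketch of the same pattern follows; all component and function names in it are illustrative stand-ins, not the app's own.

# Sketch of Gradio event chaining, analogous to the wiring in app.py above.
# The prepare/embed functions here are hypothetical stand-ins for
# csv_excel_text_to_docs and docs_to_embed_np_array.
import gradio as gr

def prepare(state):
    # Returns the prepared data and a status message.
    return state, "Prepared"

def embed(docs):
    # Consumes the state written by the previous step.
    return f"Embedded: {docs}"

with gr.Blocks() as demo:
    data_state = gr.State("my_data")
    status = gr.Textbox(label="Status")
    load_button = gr.Button("Load and embed")

    # Step two runs after step one finishes; the shared State component
    # passes the prepared documents between the two handlers.
    load_button.click(prepare, inputs=data_state, outputs=[data_state, status]).\
        then(embed, inputs=data_state, outputs=status)

demo.launch()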
search_funcs/helper_functions.py CHANGED
@@ -261,13 +261,13 @@ def initial_data_load(in_file:List[str], progress = gr.Progress(track_tqdm=True)
 
     progress(0.3, desc="Loading in data files")
 
-    data_file_names = [string for string in file_list if "tokenised" not in string.lower() and "npz" not in string.lower() and "search_index" not in string.lower()]
+    data_file_names = [string for string in file_list if "tokenised" not in string.lower() and "npz" not in string.lower() or "prep_docs" in string.lower()]
     print("Data file names:", data_file_names)
 
     if not data_file_names:
         out_message = "Please load in at least one csv/Excel/parquet data file."
         print(out_message)
-        return gr.Dropdown(choices=concat_choices), gr.Dropdown(choices=concat_choices), pd.DataFrame(), pd.DataFrame(), index_load, embed_load, tokenised_load, out_message, None
+        return gr.Dropdown(choices=concat_choices), gr.Dropdown(choices=concat_choices), pd.DataFrame(), pd.DataFrame(), index_load, embed_load, tokenised_load, out_message, None, file_list
 
     # This if you have loaded in a documents object for the semantic search
     if "pkl" in data_file_names[0]:
@@ -288,11 +288,9 @@ def initial_data_load(in_file:List[str], progress = gr.Progress(track_tqdm=True)
         if file_size > file_size_bytes_500mb:
             out_message = "Data file greater than 500mb in size. Please use smaller sizes."
             print(out_message)
-            return gr.Dropdown(choices=concat_choices), gr.Dropdown(choices=concat_choices), pd.DataFrame(), pd.DataFrame(), index_load, embed_load, tokenised_load, out_message, None
-
+            return gr.Dropdown(choices=concat_choices), gr.Dropdown(choices=concat_choices), pd.DataFrame(), pd.DataFrame(), index_load, embed_load, tokenised_load, out_message, None, file_list
 
         df_new = read_file(file)
-
         df = pd.concat([df, df_new], ignore_index = True)
 
     new_choices = list(df.columns)
@@ -302,22 +300,22 @@ def initial_data_load(in_file:List[str], progress = gr.Progress(track_tqdm=True)
     progress(0.6, desc="Loading in embedding/search index files")
 
     # Check if there is a search index file already
-    index_file_names = [string for string in file_list if ".gz" in string.lower()]
+    index_file_names = [string for string in file_list if "pkl.gz" in string.lower()]
 
     if index_file_names:
         index_file_name = index_file_names[0]
         print("Search index file name found:", index_file_name)
         index_load = read_file(index_file_name)
 
-    embeddings_file_names = [string for string in file_list if "embedding" in string.lower()]
+    embeddings_file_names = [string for string in file_list if ".npz" in string.lower()]
 
     if embeddings_file_names:
         print("Loading embeddings from file.")
        embed_load = np.load(embeddings_file_names[0])['arr_0']
 
         # If embedding files have 'super_compress' in the title, they have been multiplied by 100 before save
-        if "compress" in embeddings_file_names[0]:
-            embed_load /= 100
+        #if "compress" in embeddings_file_names[0]:
+        #    embed_load /= 100
     else:
         embed_load = np.array([])
 
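The three filters above route an uploaded file list by name: anything that is neither a tokenised file nor an .npz file (or that contains 'prep_docs') is treated as data, 'pkl.gz' files as a prepared search index, and '.npz' files as saved embeddings. Note that Python's and binds tighter than or, so the new data filter reads as '(not tokenised and not npz) or prep_docs'. A small sketch of that routing, using a hypothetical file list:

# Sketch of the file-routing logic above; the file names are made up,
# the substring rules mirror the new filters in initial_data_load.
file_list = ["report_cleaned.pkl.gz", "report_embeddings.npz", "report.csv"]

# 'and' binds tighter than 'or': (not tokenised and not npz) or prep_docs.
data_file_names = [f for f in file_list
                   if ("tokenised" not in f.lower() and "npz" not in f.lower())
                   or "prep_docs" in f.lower()]
index_file_names = [f for f in file_list if "pkl.gz" in f.lower()]
embeddings_file_names = [f for f in file_list if ".npz" in f.lower()]

print(data_file_names)        # ['report_cleaned.pkl.gz', 'report.csv']
print(index_file_names)       # ['report_cleaned.pkl.gz']
print(embeddings_file_names)  # ['report_embeddings.npz']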
search_funcs/semantic_functions.py CHANGED
@@ -50,7 +50,7 @@ def load_embedding_model(embeddings_name = "BAAI/bge-small-en-v1.5", embedding_l
 
     return embeddings_model, torch_device
 
-def docs_to_bge_embed_np_array(
+def docs_to_embed_np_array(
     docs_out: list,
     in_file: list,
     output_file_state: str,
@@ -136,9 +136,9 @@ def docs_to_bge_embed_np_array(
 
     progress(0.9, desc = "Saving embeddings to file")
     if embeddings_compress == "No":
-        semantic_search_file_name = output_folder + data_file_name_no_ext + '_bge_embeddings.npz'
+        semantic_search_file_name = output_folder + data_file_name_no_ext + '_embeddings.npz'
     else:
-        semantic_search_file_name = output_folder + data_file_name_no_ext + '_bge_embedding_compress.npz'
+        semantic_search_file_name = output_folder + data_file_name_no_ext + '_embedding_compress.npz'
 
     np.savez_compressed(semantic_search_file_name, embeddings_out)
 
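The now commented-out divide-by-100 on load (in helper_functions.py above) pairs with the compressed save path here. The diff only states that 'super_compress' embeddings were multiplied by 100 before saving; the dtype they are stored in is not shown. A plausible round trip under that scheme, with int8 as an assumption (unit-length embeddings scaled by 100 fit the int8 range):

# Round-trip sketch of the compressed-save scheme referenced above.
# The int8 dtype is an assumption; the diff only says compressed
# embeddings are "multiplied by 100 before save".
import numpy as np

embeddings_out = np.random.default_rng(0).normal(size=(4, 384)).astype(np.float32)
embeddings_out /= np.linalg.norm(embeddings_out, axis=1, keepdims=True)  # unit length

# Uncompressed save: full float32 precision.
np.savez_compressed("doc_embeddings.npz", embeddings_out)

# "Super compress" save: values in [-1, 1] scaled to [-100, 100] fit int8.
np.savez_compressed("doc_embedding_compress.npz",
                    np.round(embeddings_out * 100).astype(np.int8))

# Load and undo the scaling; the first unnamed array is stored as 'arr_0'.
embed_load = np.load("doc_embedding_compress.npz")['arr_0'] / 100
print(np.abs(embed_load - embeddings_out).max())  # quantisation error <= 0.005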
search_funcs/semantic_ingest_functions.py CHANGED
@@ -97,6 +97,11 @@ def csv_excel_text_to_docs(df:PandasDataFrame, in_file:List[str], text_column:st
     ensure_output_folder_exists(output_folder)
     output_list = []
 
+    if not isinstance(text_column, str):
+        text_column = str(text_column)
+
+    print("text_column:", text_column)
+
     if not in_file:
         return None, "Please load in at least one file.", output_list
 
@@ -115,12 +120,12 @@ def csv_excel_text_to_docs(df:PandasDataFrame, in_file:List[str], text_column:st
     data_file_name = data_file_names[0]
 
     # Check if file is a document format, and explode out as needed
-    if "prepared_docs" in data_file_name:
+    if "prep_docs" in data_file_name:
         print("Loading in documents from file.")
 
         doc_sections = df
 
-        # Convert each element in the Series to a Document instance
+        print("doc_sections:", doc_sections[0])
 
         return doc_sections, "Finished preparing documents", output_list
 
@@ -147,17 +152,29 @@ def csv_excel_text_to_docs(df:PandasDataFrame, in_file:List[str], text_column:st
         clean_toc = time.perf_counter()
         clean_time_out = f"Cleaning the text took {clean_toc - clean_tic:0.1f} seconds."
         print(clean_time_out)
+
+    else:
+        df_list = list(df[text_column])
+        prepared_text_df = pd.DataFrame(data={text_column:df_list})
+
+        # Drop original column from input file to reduce file size
+        in_df = df.drop(text_column, axis = 1)
+        df = pd.concat([in_df, prepared_text_df], axis = 1)
 
-    cols = [col for col in df.columns if col != original_text_column]
-
+    cols = [col for col in df.columns if col != original_text_column]
     df["metadata"] = combine_metadata_columns(df, cols)
 
     progress(0.3, desc = "Converting data to document format")
 
+    #print("text_column name:", text_column)
+    #print("text_column:", df[text_column])
+    #print("metadata", df["metadata"])
+
     # Create a list of Document objects
-    doc_sections = [Document(page_content=row[text_column],
-                             metadata= parse_metadata(row["metadata"]))
+    doc_sections = [Document(page_content=row[text_column], metadata= parse_metadata(row["metadata"]))
                     for index, row in progress.tqdm(df.iterrows(), desc = "Splitting up text", unit = "rows")]
+
+    print("doc_sections:", doc_sections[0])
 
     ingest_toc = time.perf_counter()
 
@@ -169,15 +186,11 @@ def csv_excel_text_to_docs(df:PandasDataFrame, in_file:List[str], text_column:st
     data_file_out_name_no_ext = get_file_path_end(data_file_name)
     file_name = data_file_out_name_no_ext
 
-    if clean == "No":
-        out_doc_file_name = output_folder + file_name + "_prepared_docs.pkl.gz"
-        with gzip.open(out_doc_file_name, 'wb') as file:
-            pickle.dump(doc_sections, file)
-
-    elif clean == "Yes":
-        out_doc_file_name = output_folder + file_name + "_cleaned_prepared_docs.pkl.gz"
-        with gzip.open(out_doc_file_name, 'wb') as file:
-            pickle.dump(doc_sections, file)
+    if clean == "No": out_doc_file_name = output_folder + file_name + "_prep_docs.pkl.gz"
+    elif clean == "Yes": out_doc_file_name = output_folder + file_name + "_cleaned_prep_docs.pkl.gz"
+
+    with gzip.open(out_doc_file_name, 'wb') as file:
+        pickle.dump(doc_sections, file)
 
     output_list.append(out_doc_file_name)
     print("Documents saved to file.")