seanpedrickcase committed
Commit fea085c
Parent: d3ff2e2

Changed all intermediate file outputs to save to output folder
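Every intermediate artefact (BM25 search indexes, prepared documents, fuzzy-search results) is now written under a shared output folder rather than the working directory. A minimal sketch of the recurring pattern, assuming output_folder is a module-level string ending in a path separator (the spacy_search_funcs.py hunk below shows it being imported from search_funcs.helper_functions; the "output/" value and the intermediate_path helper here are illustrative only, not the app's actual code):

import os

output_folder = "output/"  # assumed value; the app defines this in search_funcs.helper_functions
os.makedirs(output_folder, exist_ok=True)

def intermediate_path(file_stem: str, suffix: str) -> str:
    # Hypothetical helper: prefix every intermediate file name with the output folder
    return output_folder + file_stem + suffix

print(intermediate_path("my_data", "_search_index.pkl.gz"))  # output/my_data_search_index.pkl.gz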

app.py CHANGED
@@ -12,11 +12,6 @@ from search_funcs.helper_functions import display_info, initial_data_load, put_c
 from search_funcs.spacy_search_funcs import spacy_fuzzy_search
 from search_funcs.aws_functions import load_data_from_aws
 
-#from fastapi import FastAPI
-#app = FastAPI()
-
-
-
 # Attempt to delete temporary files generated by previous use of the app (as the files can be very big!). Only set up to work for local runs in Windows
 temp_folder_path = get_temp_folder_path()
 empty_folder(temp_folder_path)
@@ -104,8 +99,7 @@ depends on factors such as the type of documents or queries. Information taken f
 **Thematic/semantic search**
 
 This search type enables you to search for broader themes (e.g. happiness, nature) and the search will pick out text passages that relate to these themes even if they don't contain the exact words. 1. Load in a data file (ideally a file with '_cleaned' at the end of the name, a pkl.gz file), with (optionally) the 'embeddings... .npz' file in the same folder to save loading time. 2. Select the field in your data to search. 3. Wait for the data file to be prepared for search. 4. Enter the search term in the 'Enter semantic search query here' box below and press Enter/click on 'Start semantic search'. 5. Your search results will be saved in a csv file and will be presented in the 'File output' area below.
-    """)
-
+    """)
 
 with gr.Row():
     current_source_semantic = gr.Textbox(label="Current data source(s)", value="None")
search_funcs/bm25_functions.py CHANGED
@@ -220,12 +220,12 @@ class BM25:
         return list(indices), docs, list(scores)
 
     def save(self, filename):
-        with open(f"{filename}.pkl", "wb") as fsave:
+        with open(f"{output_folder}{filename}.pkl", "wb") as fsave:
             pickle.dump(self, fsave, protocol=pickle.HIGHEST_PROTOCOL)
 
    @staticmethod
    def load(filename):
-        with open(f"{filename}.pkl", "rb") as fsave:
+        with open(f"{output_folder}{filename}.pkl", "rb") as fsave:
             return pickle.load(fsave)
 
    # These following functions are my own work
@@ -432,9 +432,9 @@ def prepare_bm25(corpus, in_file, text_column, search_index, clean, return_inter
     progress(0.8, desc = "Saving search index to file")
 
     if clean == "Yes":
-        bm25_search_file_name = data_file_name_no_ext + '_cleaned_search_index.pkl.gz'
+        bm25_search_file_name = output_folder + data_file_name_no_ext + '_cleaned_search_index.pkl.gz'
     else:
-        bm25_search_file_name = data_file_name_no_ext + '_search_index.pkl.gz'
+        bm25_search_file_name = output_folder + data_file_name_no_ext + '_search_index.pkl.gz'
     #np.savez_compressed(bm25_search_file_name, bm25)
 
     with gzip.open(bm25_search_file_name, 'wb') as file:
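A minimal, self-contained sketch of the patched gzip save/load round trip (the "output/" value and the stand-in index object are assumptions; in the app, the object is the prepared BM25 instance):

import gzip
import os
import pickle

output_folder = "output/"          # assumed value
data_file_name_no_ext = "my_data"  # hypothetical input file stem
os.makedirs(output_folder, exist_ok=True)

bm25 = {"docs": ["stand-in for the prepared BM25 index"]}

# Save step, mirroring the patched prepare_bm25
bm25_search_file_name = output_folder + data_file_name_no_ext + '_search_index.pkl.gz'
with gzip.open(bm25_search_file_name, 'wb') as file:
    pickle.dump(bm25, file)

# Later runs can reload the saved index instead of rebuilding it
with gzip.open(bm25_search_file_name, 'rb') as file:
    bm25 = pickle.load(file)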
search_funcs/helper_functions.py CHANGED
@@ -231,11 +231,6 @@ def put_columns_in_join_df(in_file):
     return gr.Dropdown(choices=concat_choices), new_df, out_message
 
 
-    """
-    A dummy function that exists just so that dropdown updates work correctly.
-    """
-    return None
-
 def display_info(info_component):
     gr.Info(info_component)
 
search_funcs/semantic_ingest_functions.py CHANGED
@@ -68,9 +68,7 @@ def parse_file_not_used(file_paths, text_column='text'):
     file_names = []
 
     for file_path in file_paths:
-        #print(file_path.name)
-        #file = open(file_path.name, 'r')
-        #print(file)
+
         file_extension = detect_file_type(file_path.name)
         if file_extension in extension_to_parser:
             parsed_contents[file_path.name] = extension_to_parser[file_extension](file_path.name)
@@ -222,19 +220,11 @@ def csv_excel_text_to_docs(df, in_file, text_column, clean = "No", return_interm
     if "prepared_docs" in data_file_name:
         print("Loading in documents from file.")
 
-        #print(df[0:5])
-        #section_series = df.iloc[:,0]
-        #section_series = "{" + section_series + "}"
-
         doc_sections = df
 
-        #print(doc_sections[0])
-
         # Convert each element in the Series to a Document instance
-        #doc_sections = section_series.apply(lambda x: Document(**x))
 
         return doc_sections, "Finished preparing documents", output_list
-        # df = document_to_dataframe(df.iloc[:,0])
 
     ingest_tic = time.perf_counter()
 
@@ -248,16 +238,9 @@ def csv_excel_text_to_docs(df, in_file, text_column, clean = "No", return_interm
         clean_tic = time.perf_counter()
         print("Starting data clean.")
 
-        #df = df.drop_duplicates(text_column)
-
         df_list = list(df[text_column])
         df_list = initial_clean(df_list)
 
-        # Get rid of old data and keep only the new
-        #df = df.drop(text_column, axis = 1)
-
-
-
         # Save to file if you have cleaned the data. Text column has now been renamed with '_cleaned' at the end
         out_file_name, text_column, df = save_prepared_bm25_data(data_file_name, df_list, df, text_column)
 
@@ -272,13 +255,6 @@ def csv_excel_text_to_docs(df, in_file, text_column, clean = "No", return_interm
 
     df["metadata"] = combine_metadata_columns(df, cols)
 
-    #df = df.rename(columns={text_column:"page_content"})
-
-    #print(df[["page_content", "metadata"]].to_dict(orient='records'))
-
-    #doc_sections = df[["page_content", "metadata"]].to_dict(orient='records')
-    #doc_sections = [Document(**row) for row in df[["page_content", "metadata"]].to_dict(orient='records')]
-
     progress(0.3, desc = "Converting data to document format")
 
     # Create a list of Document objects
@@ -295,29 +271,17 @@ def csv_excel_text_to_docs(df, in_file, text_column, clean = "No", return_interm
     progress(0.5, desc = "Saving prepared documents")
     data_file_out_name_no_ext = get_file_path_end(data_file_name)
     file_name = data_file_out_name_no_ext
-    #print(doc_sections)
-    #page_content_series_string = pd.Series(doc_sections).astype(str)
-    #page_content_series_string = page_content_series_string.str.replace(" type='Document'", "").str.replace("' metadata=", "', 'metadata':").str.replace("page_content=", "{'page_content':")
-    #page_content_series_string = page_content_series_string + "}"
-    #print(page_content_series_string[0])
-    #metadata_series_string = pd.Series(doc_sections[1]).astype(str)
-
 
     if clean == "No":
-        #pd.DataFrame(data = {"Documents":page_content_series_string}).to_parquet(file_name + "_prepared_docs.parquet")
         out_doc_file_name = output_folder + file_name + "_prepared_docs.pkl.gz"
         with gzip.open(out_doc_file_name, 'wb') as file:
             pickle.dump(doc_sections, file)
 
-        #pd.Series(doc_sections).to_pickle(file_name + "_prepared_docs.pkl")
     elif clean == "Yes":
-        #pd.DataFrame(data = {"Documents":page_content_series_string}).to_parquet(file_name + "_prepared_docs_clean.parquet")
-
         out_doc_file_name = output_folder + file_name + "_cleaned_prepared_docs.pkl.gz"
         with gzip.open(out_doc_file_name, 'wb') as file:
             pickle.dump(doc_sections, file)
 
-        #pd.Series(doc_sections).to_pickle(file_name + "_prepared_docs_clean.pkl")
     output_list.append(out_doc_file_name)
     print("Documents saved to file.")
 
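The prepared-documents save step follows the same convention, branching on whether the text was cleaned. A small sketch with assumed values for output_folder, file_name, and doc_sections:

import gzip
import os
import pickle

output_folder = "output/"  # assumed value
file_name = "my_data"      # hypothetical stem returned by get_file_path_end
clean = "Yes"
doc_sections = ["stand-in for the list of prepared Document objects"]
os.makedirs(output_folder, exist_ok=True)

# Cleaned and uncleaned runs get distinct, folder-prefixed file names
if clean == "No":
    out_doc_file_name = output_folder + file_name + "_prepared_docs.pkl.gz"
else:
    out_doc_file_name = output_folder + file_name + "_cleaned_prepared_docs.pkl.gz"

with gzip.open(out_doc_file_name, 'wb') as file:
    pickle.dump(doc_sections, file)

print(out_doc_file_name)  # output/my_data_cleaned_prepared_docs.pkl.gz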
search_funcs/spacy_search_funcs.py CHANGED
@@ -7,7 +7,7 @@ import gradio as gr
 import pandas as pd
 from typing import List, Type
 from datetime import datetime
-from search_funcs.helper_functions import create_highlighted_excel_wb
+from search_funcs.helper_functions import create_highlighted_excel_wb, output_folder
 
 PandasDataFrame = Type[pd.DataFrame]
 
@@ -106,7 +106,7 @@ def spacy_fuzzy_search(string_query:str, df_list: List[str], original_data: Pand
 
     # Out file
     query_str_file = ("_").join(tokenised_query)
-    results_df_name = "keyword_search_result_" + today_rev + "_" + query_str_file + ".xlsx"
+    results_df_name = output_folder + "fuzzy_keyword_search_result_" + today_rev + "_" + query_str_file + ".xlsx"
 
     print("Saving search file output")
     progress(0.7, desc = "Saving search output to file")
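The renamed fuzzy-search output is assembled the same way; a short sketch with an assumed date format (the diff shows only the datetime import, not how today_rev is derived):

from datetime import datetime

output_folder = "output/"  # assumed value; the app imports it from search_funcs.helper_functions
today_rev = datetime.now().strftime("%Y%m%d")  # assumed format for today_rev
tokenised_query = ["happy", "nature"]          # hypothetical tokenised search terms

query_str_file = ("_").join(tokenised_query)
results_df_name = output_folder + "fuzzy_keyword_search_result_" + today_rev + "_" + query_str_file + ".xlsx"
print(results_df_name)  # e.g. output/fuzzy_keyword_search_result_20240101_happy_nature.xlsx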