Sean-Case committed on
Commit 63049fe • 1 Parent(s): 3df8e40

Better error checking. Doesn't load in embeddings file twice now.

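The gist of the change: document embeddings picked up from an uploaded .npz file are now read once at upload time, stored in Gradio session state, and reused by the embedding step instead of being loaded from disk a second time. A minimal sketch of that caching pattern is below; load_embeddings_if_present and embed_docs are illustrative stand-ins rather than the app's own functions, and the file handling assumes Gradio 4 file paths.

    import gradio as gr
    import numpy as np

    def load_embeddings_if_present(file_paths):
        # On upload, read any *.npz embeddings file once and keep the array in session state.
        for path in file_paths or []:
            if str(path).lower().endswith(".npz"):
                return np.load(path)["arr_0"], "Embeddings loaded from file."
        return np.array([]), "No embeddings file supplied."

    def embed_docs(docs, cached_embeddings):
        # Reuse the cached array rather than re-reading or re-computing it.
        if cached_embeddings.size > 0:
            return cached_embeddings
        return np.zeros((len(docs), 8))  # placeholder for the real embedding model

    with gr.Blocks() as demo:
        embeddings_state = gr.State(np.array([]))  # same idea as embeddings_state in app.py
        in_files = gr.File(file_count="multiple", type="filepath")
        load_message = gr.Textbox(label="Load progress")
        in_files.upload(load_embeddings_if_present, inputs=[in_files], outputs=[embeddings_state, load_message])
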
README.md CHANGED
@@ -4,7 +4,7 @@ emoji: 🔍
  colorFrom: purple
  colorTo: green
  sdk: gradio
- sdk_version: 3.50.0
+ sdk_version: 4.16.0
  app_file: app.py
  pinned: false
  license: apache-2.0
app.py CHANGED
@@ -6,6 +6,7 @@ from search_funcs.helper_functions import dummy_function, display_info, put_colu

  import gradio as gr
  import pandas as pd
+ import numpy as np

  PandasDataFrame = Type[pd.DataFrame]

@@ -22,18 +23,16 @@ with block:
  ingest_metadata = gr.State()
  ingest_docs = gr.State()
  vectorstore_state = gr.State() # globals()["vectorstore"]
- embeddings_state = gr.State() # globals()["embeddings"]
+ embeddings_state = gr.State(np.array([])) # globals()["embeddings"]
+ search_index_state = gr.State()

  k_val = gr.State(9999)
  out_passages = gr.State(9999)
  vec_weight = gr.State(1)

- #docs_keep_as_doc_state = gr.State()
- #doc_df_state = gr.State()
- #docs_keep_out_state = gr.State()
-
  corpus_state = gr.State()
  keyword_data_state = gr.State(pd.DataFrame())
+ join_data_state = gr.State(pd.DataFrame())
  semantic_data_state = gr.State(pd.DataFrame())

  in_k1_info = gr.State("""k1: Constant used for influencing the term frequency saturation. After saturation is reached, additional
@@ -76,7 +75,6 @@ depends on factors such as the type of documents or queries. Information taken f
  with gr.Accordion(label = "Search data", open=True):
  with gr.Row():
  keyword_query = gr.Textbox(label="Enter your search term")
- #mod_query = gr.Textbox(label="Cleaned search term (the terms that are passed to the search engine)")

  keyword_search_button = gr.Button(value="Search text")

@@ -115,7 +113,7 @@ depends on factors such as the type of documents or queries. Information taken f
  with gr.Row():
  in_clean_data = gr.Dropdown(label = "Clean text during load (remove html tags). For large files this may take some time!", value="No", choices=["Yes", "No"])
  return_intermediate_files = gr.Dropdown(label = "Return intermediate processing files from file preparation. Files can be loaded in to save processing time in future.", value="No", choices=["Yes", "No"])
- embedding_super_compress = gr.Dropdown(label = "Round embeddings to three dp for smaller files with less accuracy.", value="No", choices=["Yes", "No"])
+ embedding_super_compress = gr.Dropdown(label = "Round embeddings to three dp for smaller files with less accuracy.", value="Yes", choices=["Yes", "No"])
  #save_clean_data_button = gr.Button(value = "Save loaded data to file", scale = 1)
  with gr.Accordion(label="Keyword search options", open = False):
  with gr.Row():
@@ -133,13 +131,14 @@ depends on factors such as the type of documents or queries. Information taken f
  with gr.Row():
  in_search_param_button = gr.Button(value="Load search parameters (Need to click this if you changed anything above)")
  with gr.Accordion(label="Semantic search options", open = False):
- semantic_min_distance = gr.Slider(label = "Minimum distance score for search result to be included", value = 0.7, minimum=0, maximum=0.95, step=0.01)
+ semantic_min_distance = gr.Slider(label = "Minimum distance score for search result to be included", value = 0.75, minimum=0, maximum=0.95, step=0.01)
  with gr.Accordion(label = "Join on additional dataframes to results", open = False):
  in_join_file = gr.File(label="Upload your data to join here")
+ in_join_message = gr.Textbox(label="Join file load progress")
  in_join_column = gr.Dropdown(label="Column to join in new data frame")
  search_df_join_column = gr.Dropdown(label="Column to join in search data frame")

- in_search_param_button.click(fn=prepare_bm25, inputs=[corpus_state, in_k1, in_b, in_alpha], outputs=[load_finished_message])
+ in_search_param_button.click(fn=prepare_bm25, inputs=[corpus_state, in_bm25_file, in_bm25_column, search_index_state, return_intermediate_files, in_k1, in_b, in_alpha], outputs=[load_finished_message])

  # ---
  in_k1_button.click(display_info, inputs=in_k1_info)
@@ -149,28 +148,27 @@ depends on factors such as the type of documents or queries. Information taken f

  ### BM25 SEARCH ###
  # Update dropdowns upon initial file load
- in_bm25_file.upload(put_columns_in_df, inputs=[in_bm25_file, in_bm25_column], outputs=[in_bm25_column, in_clean_data, search_df_join_column, keyword_data_state])
- in_join_file.upload(put_columns_in_join_df, inputs=[in_join_file, in_join_column], outputs=[in_join_column])
+ in_bm25_file.upload(put_columns_in_df, inputs=[in_bm25_file, in_bm25_column], outputs=[in_bm25_column, search_df_join_column, keyword_data_state, search_index_state, embeddings_state, load_finished_message])
+ in_join_file.upload(put_columns_in_join_df, inputs=[in_join_file], outputs=[in_join_column, join_data_state, in_join_message])

  # Load in BM25 data
  load_bm25_data_button.click(fn=prepare_bm25_input_data, inputs=[in_bm25_file, in_bm25_column, keyword_data_state, in_clean_data, return_intermediate_files], outputs=[corpus_state, load_finished_message, keyword_data_state, output_file, output_file, current_source]).\
- then(fn=prepare_bm25, inputs=[corpus_state, in_bm25_file, return_intermediate_files, in_k1, in_b, in_alpha], outputs=[load_finished_message, output_file])#.\
- #then(fn=put_columns_in_df, inputs=[in_bm25_file, in_bm25_column], outputs=[in_bm25_column, in_clean_data, search_df_join_column])
-
+ then(fn=prepare_bm25, inputs=[corpus_state, in_bm25_file, in_bm25_column, search_index_state, return_intermediate_files, in_k1, in_b, in_alpha], outputs=[load_finished_message, output_file])#.\
+
  # BM25 search functions on click or enter
- keyword_search_button.click(fn=bm25_search, inputs=[keyword_query, in_no_search_results, keyword_data_state, in_bm25_column, in_clean_data, in_join_file, in_join_column, search_df_join_column], outputs=[output_single_text, output_file], api_name="keyword")
- keyword_query.submit(fn=bm25_search, inputs=[keyword_query, in_no_search_results, keyword_data_state, in_bm25_column, in_clean_data, in_join_file, in_join_column, search_df_join_column], outputs=[output_single_text, output_file])
+ keyword_search_button.click(fn=bm25_search, inputs=[keyword_query, in_no_search_results, keyword_data_state, in_bm25_column, join_data_state, in_clean_data, in_join_column, search_df_join_column], outputs=[output_single_text, output_file], api_name="keyword")
+ keyword_query.submit(fn=bm25_search, inputs=[keyword_query, in_no_search_results, keyword_data_state, in_bm25_column, join_data_state, in_clean_data, in_join_column, search_df_join_column], outputs=[output_single_text, output_file])

  ### SEMANTIC SEARCH ###
  # Load in a csv/excel file for semantic search
- in_semantic_file.upload(put_columns_in_df, inputs=[in_semantic_file, in_semantic_column], outputs=[in_semantic_column, in_clean_data, search_df_join_column, semantic_data_state])
+ in_semantic_file.upload(put_columns_in_df, inputs=[in_semantic_file, in_semantic_column], outputs=[in_semantic_column, search_df_join_column, semantic_data_state, search_index_state, embeddings_state, semantic_load_progress])
  load_semantic_data_button.click(parse_csv_or_excel, inputs=[in_semantic_file, semantic_data_state, in_semantic_column], outputs=[ingest_text, current_source_semantic, semantic_load_progress]).\
  then(csv_excel_text_to_docs, inputs=[ingest_text, in_semantic_file, in_semantic_column, in_clean_data, return_intermediate_files], outputs=[ingest_docs, semantic_load_progress]).\
- then(docs_to_jina_embed_np_array, inputs=[ingest_docs, in_semantic_file, return_intermediate_files, embedding_super_compress], outputs=[semantic_load_progress, vectorstore_state, semantic_output_file])
+ then(docs_to_jina_embed_np_array, inputs=[ingest_docs, in_semantic_file, embeddings_state, return_intermediate_files, embedding_super_compress], outputs=[semantic_load_progress, vectorstore_state, semantic_output_file])

  # Semantic search query
- semantic_submit.click(jina_simple_retrieval, inputs=[semantic_query, vectorstore_state, ingest_docs, in_semantic_column, k_val, out_passages, semantic_min_distance, vec_weight, in_join_file, in_join_column, search_df_join_column], outputs=[semantic_output_single_text, semantic_output_file], api_name="semantic")
- semantic_query.submit(jina_simple_retrieval, inputs=[semantic_query, vectorstore_state, ingest_docs, in_semantic_column, k_val, out_passages, semantic_min_distance, vec_weight, in_join_file, in_join_column, search_df_join_column], outputs=[semantic_output_single_text, semantic_output_file])
+ semantic_submit.click(jina_simple_retrieval, inputs=[semantic_query, vectorstore_state, ingest_docs, in_semantic_column, k_val, out_passages, semantic_min_distance, vec_weight, join_data_state, in_join_column, search_df_join_column], outputs=[semantic_output_single_text, semantic_output_file], api_name="semantic")
+ semantic_query.submit(jina_simple_retrieval, inputs=[semantic_query, vectorstore_state, ingest_docs, in_semantic_column, k_val, out_passages, semantic_min_distance, vec_weight, join_data_state, in_join_column, search_df_join_column], outputs=[semantic_output_single_text, semantic_output_file])

  # Dummy functions just to get dropdowns to work correctly with Gradio 3.50
  in_bm25_column.change(dummy_function, in_bm25_column, None)
how_to_create_exe_dist.txt CHANGED
@@ -17,10 +17,10 @@ NOTE: for ensuring that spaCy models are loaded into the program correctly in re
  8. In command line, cd to the folder that contains app.py. Then run the following:

  For one single file:
- python -m PyInstaller --additional-hooks-dir="build_deps\\" --hidden-import pyarrow.vendored.version --add-data="build_deps\\types.json;gradio_client" --add-data "model;model" --onefile --clean --noconfirm --name DataSearchApp_0.2.1 app.py
+ python -m PyInstaller --additional-hooks-dir="build_deps\\" --hidden-import pyarrow.vendored.version --add-data="build_deps\\types.json;gradio_client" --add-data "model;model" --onefile --clean --noconfirm --name DataSearchApp_0.2.2 app.py

  For a small exe with a folder of dependencies:
- python -m PyInstaller --additional-hooks-dir="build_deps\\" --hidden-import pyarrow.vendored.version --add-data="build_deps\\types.json;gradio_client" --add-data "model;model" --clean --noconfirm --name DataSearchApp_0.2.1 app.py
+ python -m PyInstaller --additional-hooks-dir="build_deps\\" --hidden-import pyarrow.vendored.version --add-data="build_deps\\types.json;gradio_client" --add-data "model;model" --clean --noconfirm --name DataSearchApp_0.2.2 app.py

  9. A 'dist' folder will be created with the executable inside along with all dependencies('dist\data_text_search').

requirements.txt CHANGED
@@ -7,4 +7,4 @@ accelerate==0.26.0
  torch==2.1.2
  spacy==3.7.2
  en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1.tar.gz
- gradio==3.50.0
+ gradio==4.16.0
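
Both the Space README and requirements.txt move from Gradio 3.50.0 to 4.16.0. One consequence worth keeping in mind for the callbacks in this commit: in Gradio 4, an event handler updates a component by returning a fresh component instance (for example gr.Dropdown(choices=...)), the pattern used by put_columns_in_df and put_columns_in_join_df below, rather than the per-component .update() helpers from Gradio 3. A small sketch of that update style; the callback and labels here are illustrative only, not the app's code:

    import gradio as gr

    def set_columns(csv_header: str):
        # Gradio 4 style: return a new component instance to change the dropdown's choices.
        columns = [c.strip() for c in csv_header.split(",") if c.strip()]
        return gr.Dropdown(choices=columns, value=columns[0] if columns else None)

    with gr.Blocks() as demo:
        header_box = gr.Textbox(label="Comma-separated column names")
        col_dropdown = gr.Dropdown(label="Column to search")
        header_box.submit(set_columns, inputs=[header_box], outputs=[col_dropdown])
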
search_funcs/bm25_functions.py CHANGED
@@ -18,7 +18,7 @@ from search_funcs.clean_funcs import initial_clean # get_lemma_tokens, stem_sent
  from search_funcs.helper_functions import read_file, get_file_path_end_with_ext, get_file_path_end

  # Load the SpaCy model
- from spacy.cli import download
+ from spacy.cli.download import download
  import spacy
  spacy.prefer_gpu()

@@ -231,13 +231,25 @@ class BM25:

  # These following functions are my own work

- def prepare_bm25_input_data(in_file, text_column, data_state, clean="No", return_intermediate_files = "No", progress=gr.Progress()):
+ def prepare_bm25_input_data(in_file, text_column, data_state, clean="No", return_intermediate_files = "No", progress=gr.Progress(track_tqdm=True)):
+ print(in_file)

+ if not in_file:
+ print("No input file found. Please load in at least one file.")
+ return None, "No input file found. Please load in at least one file.", data_state, None, None, None
+
+ progress(0, desc = "Loading in data")
  file_list = [string.name for string in in_file]

  #print(file_list)

- data_file_names = [string.lower() for string in file_list if "tokenised" not in string and "npz" not in string.lower() and "gz" not in string.lower()]
+ data_file_names = [string.lower() for string in file_list if "tokenised" not in string.lower() and "npz" not in string.lower() and "gz" not in string.lower()]
+
+ if not data_file_names:
+ return None, "Please load in at least one csv/Excel/parquet data file.", data_state, None, None, None
+
+ if not text_column:
+ return None, "Please enter a column name to search.", data_state, None, None, None

  data_file_name = data_file_names[0]

@@ -263,6 +275,7 @@ def prepare_bm25_input_data(in_file, text_column, data_state, clean="No", retur
  tokenised_df = read_file(tokenised_file_names[0])

  if clean == "Yes":
+ progress(0.1, desc = "Cleaning data")
  clean_tic = time.perf_counter()
  print("Starting data clean.")

@@ -280,14 +293,16 @@ def prepare_bm25_input_data(in_file, text_column, data_state, clean="No", retur
  else:
  # Don't clean or save file to disk
  df_list = list(df[text_column])
- print("No data cleaning performed.")
+ print("No data cleaning performed")
  out_file_name = None

  # Tokenise data. If tokenised df already exists, no need to do anything

+ progress(0.4, desc = "Tokenising text")
+
  if not tokenised_df.empty:
  corpus = tokenised_df.iloc[:,0].tolist()
- print("Tokeniser loaded from file.")
+ print("Tokeniser loaded from file")
  #print("Corpus is: ", corpus[0:5])

  # If doesn't already exist, tokenize texts in batches
@@ -316,7 +331,7 @@ def prepare_bm25_input_data(in_file, text_column, data_state, clean="No", retur

  return corpus, message, df, out_file_name, None, data_file_out_name # tokenised_data_file_name

- def save_prepared_bm25_data(in_file_name, prepared_text_list, in_df, in_bm25_column):
+ def save_prepared_bm25_data(in_file_name, prepared_text_list, in_df, in_bm25_column, progress=gr.Progress(track_tqdm=True)):

  # Check if the list and the dataframe have the same length
  if len(prepared_text_list) != len(in_df):
@@ -342,31 +357,55 @@ def save_prepared_bm25_data(in_file_name, prepared_text_list, in_df, in_bm25_col

  return file_name, new_text_column

- def prepare_bm25(corpus, in_file, return_intermediate_files, k1=1.5, b = 0.75, alpha=-5):
+ def prepare_bm25(corpus, in_file, text_column, search_index, return_intermediate_files, k1=1.5, b = 0.75, alpha=-5, progress=gr.Progress(track_tqdm=True)):
  #bm25.save("saved_df_bm25")
  #bm25 = BM25.load(re.sub(r'\.pkl$', '', file_in.name))

+
+ if not in_file:
+ out_message ="No input file found. Please load in at least one file."
+ print(out_message)
+ return out_message, None
+
+ if not corpus:
+ out_message = "No data file found. Please load in at least one csv/Excel/Parquet file."
+ print(out_message)
+ return out_message, None
+
+ if not text_column:
+ out_message = "Please enter a column name to search."
+ print(out_message)
+ return out_message, None
+
  file_list = [string.name for string in in_file]

  #print(file_list)

  # Get data file name
- data_file_names = [string.lower() for string in file_list if "tokenised" not in string and "npz" not in string.lower() and "gz" not in string.lower()]
+ data_file_names = [string.lower() for string in file_list if "tokenised" not in string.lower() and "npz" not in string.lower() and "gz" not in string.lower()]
+
+ if not data_file_names:
+ return "Please load in at least one csv/Excel/parquet data file.", None

  data_file_name = data_file_names[0]
  data_file_out_name = get_file_path_end_with_ext(data_file_name)
  data_file_name_no_ext = get_file_path_end(data_file_name)

  # Check if there is a search index file already
- index_file_names = [string.lower() for string in file_list if "gz" in string.lower()]
+ #index_file_names = [string.lower() for string in file_list if "gz" in string.lower()]

+ progress(0.6, desc = "Preparing search index")

- if index_file_names:
- index_file_name = index_file_names[0]
+ #if index_file_names:
+ if search_index:
+ #index_file_name = index_file_names[0]

- print(index_file_name)
+ #print(index_file_name)

- bm25_load = read_file(index_file_name)
+ bm25_load = search_index


  #index_file_out_name = get_file_path_end_with_ext(index_file_name)
@@ -381,6 +420,8 @@ def prepare_bm25(corpus, in_file, return_intermediate_files, k1=1.5, b = 0.75, a
  bm25 = bm25_load

  if return_intermediate_files == "Yes":
+ print("Saving search index file")
+ progress(0.8, desc = "Saving search index to file")
  bm25_search_file_name = data_file_name_no_ext + '_' + 'search_index.pkl.gz'
  #np.savez_compressed(bm25_search_file_name, bm25)

@@ -420,8 +461,10 @@ def convert_bm25_query_to_tokens(free_text_query, clean="No"):

  return out_query

- def bm25_search(free_text_query, in_no_search_results, original_data, text_column, clean = "No", in_join_file = None, in_join_column = "", search_df_join_column = ""):
+ def bm25_search(free_text_query, in_no_search_results, original_data, text_column, in_join_file, clean = "No", in_join_column = "", search_df_join_column = "", progress=gr.Progress(track_tqdm=True)):

+ progress(0, desc = "Conducting keyword search")
+
  # Prepare query
  if (clean == "Yes") | (text_column.endswith("_cleaned")):
  token_query = convert_bm25_query_to_tokens(free_text_query, clean="Yes")
@@ -435,7 +478,7 @@ def bm25_search(free_text_query, in_no_search_results, original_data, text_colum

  results_index, results_text, results_scores = bm25.extract_documents_and_scores(token_query, bm25.corpus, n=in_no_search_results) #bm25.corpus #original_data[text_column]
  if not results_index:
- return "No search results found", None, token_query
+ return "No search results found", None

  print("Search complete")

@@ -448,18 +491,16 @@ def bm25_search(free_text_query, in_no_search_results, original_data, text_colum
  results_df_out = results_df[['index', 'search_text', 'search_score_abs']].merge(original_data,left_on="index", right_index=True, how="left")#.drop("index", axis=1)

  # Join on additional files
- if in_join_file:
- join_filename = in_join_file.name
-
- # Import data
- join_df = read_file(join_filename)
+ if not in_join_file.empty:
+ progress(0.5, desc = "Joining on additional data file")
+ join_df = in_join_file
  join_df[in_join_column] = join_df[in_join_column].astype(str).str.replace("\.0$","", regex=True)
  results_df_out[search_df_join_column] = results_df_out[search_df_join_column].astype(str).str.replace("\.0$","", regex=True)

  # Duplicates dropped so as not to expand out dataframe
  join_df = join_df.drop_duplicates(in_join_column)

- results_df_out = results_df_out.merge(join_df,left_on=search_df_join_column, right_on=in_join_column, how="left").drop(in_join_column, axis=1)
+ results_df_out = results_df_out.merge(join_df,left_on=search_df_join_column, right_on=in_join_column, how="left")#.drop(in_join_column, axis=1)

  # Reorder results by score
  results_df_out = results_df_out.sort_values('search_score_abs', ascending=False)
@@ -467,9 +508,13 @@ def bm25_search(free_text_query, in_no_search_results, original_data, text_colum
  # Out file
  query_str_file = ("_").join(token_query)
  results_df_name = "keyword_search_result_" + today_rev + "_" + query_str_file + ".xlsx"
+
+ print("Saving search file output")
+ progress(0.7, desc = "Saving search output to file")
+
  results_df_out.to_excel(results_df_name, index= None)
  results_first_text = results_df_out[text_column].iloc[0]

  print("Returning results")

- return results_first_text, results_df_name, token_query
+ return results_first_text, results_df_name
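
A recurring change across prepare_bm25_input_data, prepare_bm25, save_prepared_bm25_data and bm25_search is the new progress=gr.Progress(track_tqdm=True) parameter, with progress(fraction, desc=...) calls marking each stage so the Gradio UI can report loading and search progress. A minimal sketch of how that parameter is used in a handler of this kind; the function body and stage labels are illustrative, not the app's own:

    import time
    import gradio as gr

    def prepare_data(n_rows: float, progress=gr.Progress(track_tqdm=True)):
        # Gradio injects the Progress tracker; track_tqdm also mirrors any tqdm loops inside the call.
        progress(0, desc="Loading in data")
        time.sleep(0.5)
        progress(0.4, desc="Tokenising text")
        time.sleep(0.5)
        progress(0.8, desc="Saving search index to file")
        return f"Prepared {int(n_rows)} rows"

    with gr.Blocks() as demo:
        rows = gr.Number(value=1000, label="Rows")
        status = gr.Textbox(label="Status")
        gr.Button("Prepare").click(prepare_data, inputs=[rows], outputs=[status])
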
search_funcs/helper_functions.py CHANGED
@@ -7,6 +7,7 @@ import shutil
  import getpass
  import gzip
  import pickle
+ import numpy as np

  # Attempt to delete content of gradio temp folder
  def get_temp_folder_path():
@@ -89,19 +90,27 @@ def read_file(filename):

  def put_columns_in_df(in_file, in_bm25_column):
  '''
- When file is loaded, update the column dropdown choices and change 'clean data' dropdown option to 'no'.
+ When file is loaded, update the column dropdown choices
  '''
+ new_choices = []
+ concat_choices = []
+ index_load = None
+ embed_load = np.array([])
+ out_message = ""

  file_list = [string.name for string in in_file]

  #print(file_list)

- data_file_names = [string.lower() for string in file_list if "tokenised" not in string and "npz" not in string.lower()]
- data_file_name = data_file_names[0]
+ data_file_names = [string.lower() for string in file_list if "tokenised" not in string.lower() and "npz" not in string.lower() and "search_index" not in string.lower()]

- new_choices = []
- concat_choices = []
-
+ if not data_file_names:
+ out_message = "Please load in at least one csv/Excel/parquet data file."
+ print(out_message)
+ return gr.Dropdown(choices=concat_choices), gr.Dropdown(choices=concat_choices), pd.DataFrame(), bm25_load, out_message
+
+ data_file_name = data_file_names[0]
+

  df = read_file(data_file_name)

@@ -109,32 +118,60 @@ def put_columns_in_df(in_file, in_bm25_column):

  new_choices = list(df.columns)

+ elif "search_index" in data_file_name:
+ # If only the search_index found, need a data file too
+ new_choices = []
+
  else: new_choices = ["page_contents"] + list(df[0].metadata.keys()) #["Documents"]
  #print(new_choices)

- concat_choices.extend(new_choices)
+ concat_choices.extend(new_choices)
+
+ # Check if there is a search index file already
+ index_file_names = [string.lower() for string in file_list if "gz" in string.lower()]
+
+ if index_file_names:
+ index_file_name = index_file_names[0]
+ index_load = read_file(index_file_name)
+
+ embeddings_file_names = [string.lower() for string in file_list if "embedding" in string.lower()]
+
+ if embeddings_file_names:
+ print("Loading embeddings from file.")
+ embed_load = np.load(embeddings_file_names[0])['arr_0']
+
+ # If embedding files have 'super_compress' in the title, they have been multiplied by 100 before save
+ if "compress" in embeddings_file_names[0]:
+ embed_load /= 100
+ else:
+ embed_load = np.array([])
+
+ out_message = "Initial data check successful. Next, choose a data column to search in the drop down above, then click 'Load data'"
+ print(out_message)

- return gr.Dropdown(choices=concat_choices), gr.Dropdown(value="No", choices = ["Yes", "No"]), gr.Dropdown(choices=concat_choices), df
+ return gr.Dropdown(choices=concat_choices), gr.Dropdown(choices=concat_choices), df, index_load, embed_load, out_message

- def put_columns_in_join_df(in_file, in_bm25_column):
+ def put_columns_in_join_df(in_file):
  '''
- When file is loaded, update the column dropdown choices and change 'clean data' dropdown option to 'no'.
+ When file is loaded, update the column dropdown choices
  '''
-
- print("in_bm25_column")
+ new_df = pd.DataFrame()
+ #print("in_bm25_column")

  new_choices = []
  concat_choices = []


- df = read_file(in_file.name)
- new_choices = list(df.columns)
+ new_df = read_file(in_file.name)
+ new_choices = list(new_df.columns)
+
+ #print(new_choices)

- print(new_choices)
+ concat_choices.extend(new_choices)

- concat_choices.extend(new_choices)
+ out_message = "File load successful. Now select a column to join below."

- return gr.Dropdown(choices=concat_choices)
+ return gr.Dropdown(choices=concat_choices), new_df, out_message

  def dummy_function(gradio_component):
  """
search_funcs/semantic_functions.py CHANGED
@@ -12,7 +12,6 @@ today_rev = datetime.now().strftime("%Y%m%d")
  from transformers import AutoModel

  from torch import cuda, backends, tensor, mm
- from search_funcs.helper_functions import read_file

  # Check for torch cuda
  print("Is CUDA enabled? ", cuda.is_available())
@@ -43,18 +42,6 @@ except:
  embeddings_model = AutoModel.from_pretrained(embeddings_name, revision = revision_choice, trust_remote_code=True, device_map="auto")


- # Chroma support is currently deprecated
- # Import Chroma and instantiate a client. The default Chroma client is ephemeral, meaning it will not save to disk.
- #import chromadb
- #from chromadb.config import Settings
- #from typing_extensions import Protocol
- #from chromadb import Documents, EmbeddingFunction, Embeddings
-
- # Remove Chroma database file. If it exists as it can cause issues
- #chromadb_file = "chroma.sqlite3"
-
- #if os.path.isfile(chromadb_file):
- # os.remove(chromadb_file)
  def get_file_path_end(file_path):
  # First, get the basename of the file (e.g., "example.txt" from "/path/to/example.txt")
  basename = os.path.basename(file_path)
@@ -82,10 +69,17 @@ def load_embeddings(embeddings_name = embeddings_name):

  return embeddings

- def docs_to_jina_embed_np_array(docs_out, in_file, return_intermediate_files = "No", embeddings_super_compress = "No", embeddings = embeddings_model, progress=gr.Progress()):
+ def docs_to_jina_embed_np_array(docs_out, in_file, embeddings_state, return_intermediate_files = "No", embeddings_super_compress = "No", embeddings = embeddings_model, progress=gr.Progress(track_tqdm=True)):
  '''
  Takes a Langchain document class and saves it into a Chroma sqlite file.
  '''
+ if not in_file:
+ out_message = "No input file found. Please load in at least one file."
+ print(out_message)
+ return out_message, None, None
+
+ progress(0.7, desc = "Loading/creating embeddings")

  print(f"> Total split documents: {len(docs_out)}")

@@ -105,17 +99,9 @@ def docs_to_jina_embed_np_array(docs_out, in_file, return_intermediate_files = "

  out_message = "Document processing complete. Ready to search."

- if embeddings_file_names:
- print("Loading embeddings from file.")
- embeddings_out = np.load(embeddings_file_names[0])['arr_0']
-
- # If embedding files have 'super_compress' in the title, they have been multiplied by 100 before save
- if "compress" in embeddings_file_names[0]:
- embeddings_out /= 100
+ # print("embeddings loaded: ", embeddings_out)

- # print("embeddings loaded: ", embeddings_out)
-
- if not embeddings_file_names:
+ if embeddings_state.size == 0:
  tic = time.perf_counter()
  print("Starting to embed documents.")
  #embeddings_list = []
@@ -132,6 +118,7 @@ def docs_to_jina_embed_np_array(docs_out, in_file, return_intermediate_files = "

  # If you want to save your files for next time
  if return_intermediate_files == "Yes":
+ progress(0.9, desc = "Saving embeddings to file")
  if embeddings_super_compress == "No":
  semantic_search_file_name = data_file_name_no_ext + '_' + 'embeddings.npz'
  np.savez_compressed(semantic_search_file_name, embeddings_out)
@@ -144,12 +131,15 @@ def docs_to_jina_embed_np_array(docs_out, in_file, return_intermediate_files = "
  return out_message, embeddings_out, semantic_search_file_name

  return out_message, embeddings_out, None
+ else:
+ # Just return existing embeddings if already exist
+ embeddings_out = embeddings_state

  print(out_message)

  return out_message, embeddings_out, None#, None

- def process_data_from_scores_df(df_docs, in_join_file, out_passages, vec_score_cut_off, vec_weight, orig_df_col, in_join_column, search_df_join_column):
+ def process_data_from_scores_df(df_docs, in_join_file, out_passages, vec_score_cut_off, vec_weight, orig_df_col, in_join_column, search_df_join_column, progress = gr.Progress(track_tqdm=True)):

  def create_docs_keep_from_df(df):
  dict_out = {'ids' : [df['ids']],
@@ -213,11 +203,10 @@ def process_data_from_scores_df(df_docs, in_join_file, out_passages, vec_score_c
  # results_df_out = orig_df.merge(length_more_limit[['ids', 'distances']], left_index = True, right_on = "ids", how="inner").sort_values("distances")

  # Join on additional files
- if in_join_file:
- join_filename = in_join_file.name
+ if not in_join_file.empty:
+ progress(0.5, desc = "Joining on additional data file")
+ join_df = in_join_file

- # Import data
- join_df = read_file(join_filename)
  join_df[in_join_column] = join_df[in_join_column].astype(str).str.replace("\.0$","", regex=True)

  # Duplicates dropped so as not to expand out dataframe
@@ -225,14 +214,17 @@ def process_data_from_scores_df(df_docs, in_join_file, out_passages, vec_score_c

  results_df_out[search_df_join_column] = results_df_out[search_df_join_column].astype(str).str.replace("\.0$","", regex=True)

- results_df_out = results_df_out.merge(join_df,left_on=search_df_join_column, right_on=in_join_column, how="left").drop(in_join_column, axis=1)
+ results_df_out = results_df_out.merge(join_df,left_on=search_df_join_column, right_on=in_join_column, how="left")#.drop(in_join_column, axis=1)

  return results_df_out

  def jina_simple_retrieval(query_str:str, vectorstore, docs, orig_df_col:str, k_val:int, out_passages:int,
- vec_score_cut_off:float, vec_weight:float, in_join_file = None, in_join_column = None, search_df_join_column = None, device = torch_device, embeddings = embeddings_model, progress=gr.Progress()): # ,vectorstore, embeddings
+ vec_score_cut_off:float, vec_weight:float, in_join_file, in_join_column = None, search_df_join_column = None, device = torch_device, embeddings = embeddings_model, progress=gr.Progress(track_tqdm=True)): # ,vectorstore, embeddings

  # print("vectorstore loaded: ", vectorstore)
+ progress(0, desc = "Conducting semantic search")
+
+ print("Searching")

  # Convert it to a PyTorch tensor and transfer to GPU
  vectorstore_tensor = tensor(vectorstore).to(device)
@@ -277,6 +269,8 @@ def jina_simple_retrieval(query_str:str, vectorstore, docs, orig_df_col:str, k_v

  results_df_out = process_data_from_scores_df(df_docs, in_join_file, out_passages, vec_score_cut_off, vec_weight, orig_df_col, in_join_column, search_df_join_column)

+ print("Search complete")
+
  # If nothing found, return error message
  if results_df_out.empty:
  return 'No result found!', None
@@ -284,12 +278,30 @@ def jina_simple_retrieval(query_str:str, vectorstore, docs, orig_df_col:str, k_v
  query_str_file = query_str.replace(" ", "_")

  results_df_name = "semantic_search_result_" + today_rev + "_" + query_str_file + ".xlsx"
+
+ print("Saving search output to file")
+ progress(0.7, desc = "Saving search output to file")
+
  results_df_out.to_excel(results_df_name, index= None)
  results_first_text = results_df_out.iloc[0, 1]

+ print("Returning results")
+
  return results_first_text, results_df_name

  # Deprecated Chroma functions - kept just in case needed in future.
+ # Chroma support is currently deprecated
+ # Import Chroma and instantiate a client. The default Chroma client is ephemeral, meaning it will not save to disk.
+ #import chromadb
+ #from chromadb.config import Settings
+ #from typing_extensions import Protocol
+ #from chromadb import Documents, EmbeddingFunction, Embeddings
+
+ # Remove Chroma database file. If it exists as it can cause issues
+ #chromadb_file = "chroma.sqlite3"
+
+ #if os.path.isfile(chromadb_file):
+ # os.remove(chromadb_file)

  def docs_to_chroma_save_deprecated(docs_out, embeddings = embeddings_model, progress=gr.Progress()):
  '''
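
jina_simple_retrieval keeps the document embeddings as a plain array in vectorstore_state and, as the tensor/mm usage above suggests, scores a query with a matrix product against that array before applying the minimum-score cut-off from the slider. The sketch below shows that style of retrieval in general terms; it is not the app's implementation, and the normalisation step and the 0.75 threshold are assumptions carried over from the UI default:

    import numpy as np
    from torch import tensor, mm

    def retrieve(query_vec: np.ndarray, doc_embeddings: np.ndarray, k: int = 5, min_score: float = 0.75):
        # Normalise both sides so the dot product behaves like cosine similarity.
        q = tensor(query_vec / np.linalg.norm(query_vec)).float().unsqueeze(0)
        docs = tensor(doc_embeddings / np.linalg.norm(doc_embeddings, axis=1, keepdims=True)).float()
        scores = mm(q, docs.T).squeeze(0)  # one similarity score per document
        top = scores.argsort(descending=True)[:k]
        return [(int(i), float(scores[i])) for i in top if float(scores[i]) >= min_score]
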
search_funcs/semantic_ingest_functions.py CHANGED
@@ -294,12 +294,23 @@ def parse_metadata(row):

  # return doc_sections, message

- def csv_excel_text_to_docs(df, in_file, text_column='text', clean = "No", return_intermediate_files = "No", chunk_size=None, progress=gr.Progress()) -> List[Document]:
+ def csv_excel_text_to_docs(df, in_file, text_column, clean = "No", return_intermediate_files = "No", chunk_size=None, progress=gr.Progress(track_tqdm=True)) -> List[Document]:
  """Converts a DataFrame's content to a list of dictionaries in the 'Document' format, containing page_content and associated metadata."""
+ if not in_file:
+ return None, "Please load in at least one file.", data_state, None, None, None
+
+ progress(0, desc = "Loading in data")

  file_list = [string.name for string in in_file]

  data_file_names = [string.lower() for string in file_list if "tokenised" not in string and "npz" not in string.lower()]
+
+ if not data_file_names:
+ return doc_sections, "Please load in at least one csv/Excel/parquet data file."
+
+ if not text_column:
+ return None, "Please enter a column name to search", data_state, None, None, None
+
  data_file_name = data_file_names[0]

  # Check if file is a document format, and explode out as needed
@@ -326,6 +337,7 @@ def csv_excel_text_to_docs(df, in_file, text_column='text', clean = "No", return
  df[text_column] = df[text_column].astype(str).str.strip() # Ensure column is a string column

  if clean == "Yes":
+ progress(0.1, desc = "Cleaning data")
  clean_tic = time.perf_counter()
  print("Starting data clean.")

@@ -352,6 +364,7 @@ def csv_excel_text_to_docs(df, in_file, text_column='text', clean = "No", return
  #doc_sections = df[["page_content", "metadata"]].to_dict(orient='records')
  #doc_sections = [Document(**row) for row in df[["page_content", "metadata"]].to_dict(orient='records')]

+ progress(0.3, desc = "Converting data to document format")

  # Create a list of Document objects
  doc_sections = [Document(page_content=row['page_content'],
@@ -364,6 +377,7 @@ def csv_excel_text_to_docs(df, in_file, text_column='text', clean = "No", return
  print(ingest_time_out)

  if return_intermediate_files == "Yes":
+ progress(0.5, desc = "Saving prepared documents")
  data_file_out_name_no_ext = get_file_path_end(data_file_name)
  file_name = data_file_out_name_no_ext
  #print(doc_sections)