seanpedrickcase committed
Commit 2089141 • 1 Parent(s): 8466e45

Now checks for output folder before saving. Minor code cleaning

Dockerfile CHANGED
@@ -14,7 +14,7 @@ COPY requirements.txt .
 RUN pip install --no-cache-dir -r requirements.txt
 
 # Gradio needs to be installed after due to conflict with spacy in requirements
- RUN pip install --no-cache-dir gradio==4.31.0
+ RUN pip install --no-cache-dir gradio==4.31.4
 
 # Download the BGE embedding model during the build process
 RUN curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | bash
README.md CHANGED
@@ -4,7 +4,7 @@ emoji: 🔍
 colorFrom: purple
 colorTo: green
 sdk: gradio
- sdk_version: 4.31.0
+ sdk_version: 4.31.4
 app_file: app.py
 pinned: false
 license: apache-2.0
app.py CHANGED
@@ -129,7 +129,7 @@ depends on factors such as the type of documents or queries. Information taken f
 with gr.Accordion(label="Data load / save options", open = True):
 with gr.Row():
 in_clean_data = gr.Dropdown(label = "Clean text during load (remove html tags). For large files this may take some time!", value="No", choices=["Yes", "No"])
- return_intermediate_files = gr.Dropdown(label = "Return intermediate processing files from file preparation. Files can be loaded in to save processing time in future.", value="False", choices=["Yes", "No"])
+ return_intermediate_files = gr.Dropdown(label = "Return intermediate processing files from file preparation. Files can be loaded in to save processing time in future.", value="No", choices=["Yes", "No"])
 embedding_super_compress = gr.Dropdown(label = "Round embeddings to three dp for smaller files with less accuracy.", value="Yes", choices=["Yes", "No"])
 #save_clean_data_button = gr.Button(value = "Save loaded data to file", scale = 1)
 with gr.Accordion(label="Keyword search options", open = False):
@@ -156,6 +156,7 @@ depends on factors such as the type of documents or queries. Information taken f
 in_join_message = gr.Textbox(label="Join file load progress")
 in_join_column = gr.Dropdown(label="Column to join in new data frame")
 search_df_join_column = gr.Dropdown(label="Column to join in search data frame")
+
 with gr.Accordion(label = "AWS data access", open = False):
 aws_password_box = gr.Textbox(label="Password for AWS data access (ask the Data team if you don't have this)")
 with gr.Row():
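
Side note on the app.py hunk above (not part of the commit): Gradio expects a Dropdown's default value to be one of its choices, which is why value="False" is replaced by value="No" for a ["Yes", "No"] dropdown; a value outside the choices triggers a warning or error depending on version. A minimal, self-contained sketch with illustrative names:

import gradio as gr

with gr.Blocks() as demo:
    # Fine: the default value matches one of the listed choices.
    return_intermediate_files = gr.Dropdown(
        label="Return intermediate processing files from file preparation.",
        value="No",
        choices=["Yes", "No"],
    )
    # Problematic: "False" is not in ["Yes", "No"], so Gradio complains about it.
    # bad_dropdown = gr.Dropdown(label="Example", value="False", choices=["Yes", "No"])

# demo.launch() would start the interface.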
how_to_create_exe_dist.txt CHANGED
@@ -14,7 +14,7 @@ NOTE: for ensuring that spaCy models are loaded into the program correctly in re
 
 9.Run the following, assuming you want to make one single .exe file (This helped me: https://github.com/pyinstaller/pyinstaller/issues/8108):
 
- a) In command line: pyi-makespec --additional-hooks-dir="build_deps\\" --collect-data=gradio_client --collect-data=gradio --hidden-import pyarrow.vendored.version --onefile --name DataSearchApp_0.3 app.py
+ a) In command line: pyi-makespec --additional-hooks-dir="build_deps\\" --collect-data=gradio_client --collect-data=gradio --hidden-import pyarrow.vendored.version --onefile --name DataSearchApp_0.4 app.py
 
 b) Open the created spec file in Notepad. Add the following to the end of the Analysis section then save:
 
@@ -25,7 +25,7 @@ a = Analysis(
 }
 )
 
- c) Back in command line, run this: pyinstaller --clean --noconfirm DataSearchApp_0.3.spec
+ c) Back in command line, run this: pyinstaller --clean --noconfirm DataSearchApp_0.4.spec
 
 
 9. A 'dist' folder will be created with the executable inside along with all dependencies('dist\data_text_search').
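
For context on step b) above: only the closing braces of the block being referenced are visible in the hunk. Based on the workaround discussed in the linked PyInstaller issue (#8108), the addition to the spec file is typically along the lines of the fragment below; the exact contents used in this repo are not shown in the diff, so treat this purely as a sketch:

a = Analysis(
    ['app.py'],
    # ... other arguments generated by pyi-makespec ...
    module_collection_mode={
        'gradio': 'py',  # collect gradio as plain .py source files, since it inspects its own source at runtime
    },
)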
search_funcs/aws_functions.py CHANGED
@@ -7,8 +7,8 @@ import os
 PandasDataFrame = Type[pd.DataFrame]
 
 try:
+ session = boto3.Session()
 bucket_name = os.environ['DATA_TEXT_SEARCH_BUCKET']
- session = boto3.Session(profile_name="default")
 except Exception as e:
 bucket_name = ''
 print(e)
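
Side note on the aws_functions.py hunk above (not part of the commit): boto3.Session() with no arguments resolves credentials through the default chain (environment variables, shared credential files, then container/instance role credentials), whereas an explicit profile_name="default" needs a matching profile in the local AWS config files and can raise ProfileNotFound where none exists. A minimal sketch of the difference:

import boto3

# Default chain: environment variables, ~/.aws files, then IAM role credentials.
session = boto3.Session()
s3 = session.client("s3")

# Requires a [default] profile in the local AWS config files,
# so it can fail with ProfileNotFound on hosts that have none:
# session = boto3.Session(profile_name="default")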
search_funcs/bm25_functions.py CHANGED
@@ -14,7 +14,7 @@ from datetime import datetime
 today_rev = datetime.now().strftime("%Y%m%d")
 
 from search_funcs.clean_funcs import initial_clean # get_lemma_tokens, stem_sentence
- from search_funcs.helper_functions import get_file_path_end_with_ext, get_file_path_end, create_highlighted_excel_wb
+ from search_funcs.helper_functions import get_file_path_end_with_ext, get_file_path_end, create_highlighted_excel_wb, ensure_output_folder_exists
 
 # Load the SpaCy model
 from spacy.cli.download import download
@@ -232,6 +232,7 @@ class BM25:
 
 def prepare_bm25_input_data(in_file, text_column, data_state, tokenised_state, clean="No", return_intermediate_files = "No", progress=gr.Progress(track_tqdm=True)):
 #print(in_file)
+ ensure_output_folder_exists()
 
 if not in_file:
 print("No input file found. Please load in at least one file.")
@@ -324,6 +325,7 @@ def prepare_bm25_input_data(in_file, text_column, data_state, tokenised_state, c
 message = "Data loaded. Warning: dataset may be too short to get consistent search results."
 
 if return_intermediate_files == "Yes":
+
 if clean == "Yes":
 tokenised_data_file_name = "output/" + data_file_out_name_no_ext + "_cleaned_tokenised.parquet"
 else:
@@ -337,6 +339,8 @@ def prepare_bm25_input_data(in_file, text_column, data_state, tokenised_state, c
 
 def save_prepared_bm25_data(in_file_name, prepared_text_list, in_df, in_bm25_column, progress=gr.Progress(track_tqdm=True)):
 
+ ensure_output_folder_exists()
+
 # Check if the list and the dataframe have the same length
 if len(prepared_text_list) != len(in_df):
 raise ValueError("The length of 'prepared_text_list' and 'in_df' must match.")
@@ -543,6 +547,8 @@ def bm25_search(free_text_query, in_no_search_results, original_data, searched_d
 results_df_out = results_df_out.sort_values(['search_score_abs', "search_text"], ascending=False)
 
 # Out file
+ ensure_output_folder_exists()
+
 query_str_file = ("_").join(token_query)
 results_df_name = "output/keyword_search_result_" + today_rev + "_" + query_str_file + ".xlsx"
 
search_funcs/helper_functions.py CHANGED
@@ -58,6 +58,18 @@ def get_file_path_end_with_ext(file_path):
 
 return filename_end
 
+ def ensure_output_folder_exists():
+ """Checks if the 'output/' folder exists, creates it if not."""
+
+ folder_name = "output/"
+
+ if not os.path.exists(folder_name):
+ # Create the folder if it doesn't exist
+ os.makedirs(folder_name)
+ print(f"Created the 'output/' folder.")
+ else:
+ print(f"The 'output/' folder already exists.")
+
 def detect_file_type(filename):
 """Detect the file type based on its extension."""
 if (filename.endswith('.csv')) | (filename.endswith('.csv.gz')) | (filename.endswith('.zip')):
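
Side note on the new helper above (not part of the commit): the exists-then-create pattern can be collapsed into a single call, since os.makedirs accepts exist_ok=True and succeeds silently when the folder is already there, which also avoids a race between the check and the create. A minimal equivalent sketch:

import os

def ensure_output_folder_exists(folder_name: str = "output/"):
    # Creates the folder if missing; does nothing if it already exists.
    os.makedirs(folder_name, exist_ok=True)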
search_funcs/semantic_functions.py CHANGED
@@ -25,7 +25,7 @@ else:
 
 print("Device used is: ", torch_device)
 
- from search_funcs.helper_functions import create_highlighted_excel_wb
+ from search_funcs.helper_functions import create_highlighted_excel_wb, ensure_output_folder_exists
 
 PandasDataFrame = Type[pd.DataFrame]
 
@@ -67,8 +67,11 @@ else:
 
 def docs_to_bge_embed_np_array(docs_out, in_file, embeddings_state, output_file_state, clean, return_intermediate_files = "No", embeddings_super_compress = "No", embeddings_model = embeddings_model, progress=gr.Progress(track_tqdm=True)):
 '''
- Takes a Langchain document class and saves it into a Chroma sqlite file.
+ Takes a Langchain document class and saves it into a Numpy array.
 '''
+
+ ensure_output_folder_exists()
+
 if not in_file:
 out_message = "No input file found. Please load in at least one file."
 print(out_message)
@@ -229,6 +232,8 @@ def bge_simple_retrieval(query_str:str, vectorstore, docs, orig_df_col:str, k_va
 # print("vectorstore loaded: ", vectorstore)
 progress(0, desc = "Conducting semantic search")
 
+ ensure_output_folder_exists()
+
 print("Searching")
 
 # Convert it to a PyTorch tensor and transfer to GPU
search_funcs/semantic_ingest_functions.py CHANGED
@@ -31,7 +31,7 @@ chunk_size = 512
 chunk_overlap = 0
 start_index = True
 
- from search_funcs.helper_functions import get_file_path_end_with_ext, detect_file_type, get_file_path_end
+ from search_funcs.helper_functions import get_file_path_end_with_ext, detect_file_type, get_file_path_end, ensure_output_folder_exists
 from search_funcs.bm25_functions import save_prepared_bm25_data
 from search_funcs.clean_funcs import initial_clean
 
@@ -198,6 +198,7 @@ def parse_metadata(row):
 def csv_excel_text_to_docs(df, in_file, text_column, clean = "No", return_intermediate_files = "No", chunk_size=None, progress=gr.Progress(track_tqdm=True)) -> List[Document]:
 """Converts a DataFrame's content to a list of dictionaries in the 'Document' format, containing page_content and associated metadata."""
 
+ ensure_output_folder_exists()
 output_list = []
 
 if not in_file: