seanpedrickcase committed
Commit 8466e45
1 Parent(s): 7bdc986

Fixed cleaning for semantic search. Handles text with backslashes (if cleaned). Updated packages. Added a requirements file for keyword search only.

Dockerfile CHANGED
@@ -1,5 +1,5 @@
  # First stage: build dependencies
- FROM public.ecr.aws/docker/library/python:3.10.13-slim
+ FROM public.ecr.aws/docker/library/python:3.11.9-slim-bookworm

  # Install wget
  RUN apt-get update && apt-get install -y wget
@@ -11,9 +11,12 @@ WORKDIR /src

  COPY requirements.txt .

- RUN pip install -r requirements.txt
+ RUN pip install --no-cache-dir -r requirements.txt

- # Download the model during the build process
+ # Gradio needs to be installed after due to conflict with spacy in requirements
+ RUN pip install --no-cache-dir gradio==4.31.0
+
+ # Download the BGE embedding model during the build process
  RUN curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | bash
  RUN apt-get install git-lfs -y
  RUN git lfs install
@@ -21,10 +24,16 @@ RUN git clone https://huggingface.co/BAAI/bge-small-en-v1.5 /model/bge
  RUN rm -rf /model/bge/.git

  # Set up a new user named "user" with user ID 1000
- #RUN useradd -m -u 1000 user
+ RUN useradd -m -u 1000 user
+
+ # Change ownership of /home/user directory
+ RUN chown -R user:user /home/user
+
+ # Create the output files directory and set its permissions
+ RUN mkdir -p /home/user/output && chown -R user:user /home/user/output

  # Switch to the "user" user
- #USER user
+ USER user

  # Set home to the user's home directory
  ENV HOME=/home/user \
@@ -43,7 +52,7 @@ ENV HOME=/home/user \
  WORKDIR $HOME/app

  # Copy the current directory contents into the container at $HOME/app setting the owner to the user
- #COPY --chown=user . $HOME/app
- COPY . $HOME/app
+ COPY --chown=user . $HOME/app
+ #COPY . $HOME/app

  CMD ["python", "app.py"]
README.md CHANGED
@@ -4,7 +4,7 @@ emoji: 🔍
  colorFrom: purple
  colorTo: green
  sdk: gradio
- sdk_version: 4.21.0
+ sdk_version: 4.31.0
  app_file: app.py
  pinned: false
  license: apache-2.0
app.py CHANGED
@@ -129,7 +129,7 @@ depends on factors such as the type of documents or queries. Information taken f
      with gr.Accordion(label="Data load / save options", open = True):
          with gr.Row():
              in_clean_data = gr.Dropdown(label = "Clean text during load (remove html tags). For large files this may take some time!", value="No", choices=["Yes", "No"])
-             return_intermediate_files = gr.Dropdown(label = "Return intermediate processing files from file preparation. Files can be loaded in to save processing time in future.", value="No", choices=["Yes", "No"])
+             return_intermediate_files = gr.Dropdown(label = "Return intermediate processing files from file preparation. Files can be loaded in to save processing time in future.", value="False", choices=["Yes", "No"])
              embedding_super_compress = gr.Dropdown(label = "Round embeddings to three dp for smaller files with less accuracy.", value="Yes", choices=["Yes", "No"])
              #save_clean_data_button = gr.Button(value = "Save loaded data to file", scale = 1)
      with gr.Accordion(label="Keyword search options", open = False):
requirements.txt CHANGED
@@ -1,13 +1,11 @@
- pandas==2.2.0
+ pandas==2.2.2
  polars==0.20.3
  pyarrow==14.0.2
  openpyxl==3.1.2
- #transformers==4.37.2
- #accelerate==0.26.0
  torch==2.1.2
- spacy==3.7.2
+ spacy
  en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1.tar.gz
- gradio==4.21.0
+ gradio
  sentence_transformers==2.3.1
  lxml==5.1.0
- boto3==1.34.63
+ boto3==1.34.103
requirements_no_semantic.txt ADDED
@@ -0,0 +1,9 @@
+ pandas==2.2.2
+ polars==0.20.3
+ pyarrow==14.0.2
+ openpyxl==3.1.2
+ spacy
+ en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1.tar.gz
+ gradio
+ lxml==5.1.0
+ boto3==1.34.103
search_funcs/aws_functions.py CHANGED
@@ -6,11 +6,11 @@ import os

  PandasDataFrame = Type[pd.DataFrame]

- bucket_name = os.environ['DATA_TEXT_SEARCH_BUCKET']
-
  try:
+     bucket_name = os.environ['DATA_TEXT_SEARCH_BUCKET']
      session = boto3.Session(profile_name="default")
  except Exception as e:
+     bucket_name = ''
      print(e)

  # sts = session.client("sts")
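The bucket lookup now sits inside the existing try/except, so importing the module no longer fails when DATA_TEXT_SEARCH_BUCKET is unset. A minimal standalone sketch of that guard, reduced to just the environment lookup (the empty-string fallback is taken from the diff):

    import os

    try:
        # Only read the bucket name when the environment variable is actually set
        bucket_name = os.environ["DATA_TEXT_SEARCH_BUCKET"]
    except Exception as e:
        bucket_name = ""  # fall back to an empty bucket name so the import still succeeds
        print(e)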
search_funcs/bm25_functions.py CHANGED
@@ -325,9 +325,9 @@ def prepare_bm25_input_data(in_file, text_column, data_state, tokenised_state, c

      if return_intermediate_files == "Yes":
          if clean == "Yes":
-             tokenised_data_file_name = data_file_out_name_no_ext + "_cleaned_tokenised.parquet"
+             tokenised_data_file_name = "output/" + data_file_out_name_no_ext + "_cleaned_tokenised.parquet"
          else:
-             tokenised_data_file_name = data_file_out_name_no_ext + "_tokenised.parquet"
+             tokenised_data_file_name = "output/" + data_file_out_name_no_ext + "_tokenised.parquet"

          pd.DataFrame(data={"Corpus":corpus}).to_parquet(tokenised_data_file_name)

@@ -354,9 +354,9 @@ def save_prepared_bm25_data(in_file_name, prepared_text_list, in_df, in_bm25_col
      prepared_df = pd.concat([in_df, prepared_text_df], axis = 1)

      if file_end == ".csv":
-         prepared_df.to_csv(file_name)
+         prepared_df.to_csv("output/" + file_name)
      elif file_end == ".parquet":
-         prepared_df.to_parquet(file_name)
+         prepared_df.to_parquet("output/" + file_name)
      else: file_name = None

      return file_name, new_text_column, prepared_df
@@ -544,7 +544,7 @@ def bm25_search(free_text_query, in_no_search_results, original_data, searched_d

      # Out file
      query_str_file = ("_").join(token_query)
-     results_df_name = "keyword_search_result_" + today_rev + "_" + query_str_file + ".xlsx"
+     results_df_name = "output/keyword_search_result_" + today_rev + "_" + query_str_file + ".xlsx"

      print("Saving search file output")
      progress(0.7, desc = "Saving search output to file")
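All three save paths above now point into an output/ folder, which the updated Dockerfile creates at /home/user/output. A hedged sketch of the same pattern for a run outside the container, where the folder may not exist yet (the "my_data" stem and sample corpus are placeholders, not from the repository):

    import os
    import pandas as pd

    # Assumed local-run guard (not part of the commit): create output/ before saving,
    # since only the Docker image pre-creates the directory.
    os.makedirs("output", exist_ok=True)

    data_file_out_name_no_ext = "my_data"  # placeholder file stem
    tokenised_data_file_name = "output/" + data_file_out_name_no_ext + "_tokenised.parquet"
    pd.DataFrame(data={"Corpus": [["example", "tokenised", "text"]]}).to_parquet(tokenised_data_file_name)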
search_funcs/clean_funcs.py CHANGED
@@ -9,6 +9,8 @@ import calendar
  #from tqdm import tqdm
  import gradio as gr

+ from typing import List
+
  # Adding custom words to the stopwords
  custom_words = []
  my_stop_words = custom_words
@@ -24,6 +26,7 @@ custom_words.extend(cal_month)


  # #### Some of my cleaning functions
+ replace_backslash = r'\\'
  email_start_pattern_regex = r'.*importance:|.*subject:'
  email_end_pattern_regex = r'kind regards.*|many thanks.*|sincerely.*'
  html_pattern_regex = r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});|\xa0|&nbsp;'
@@ -45,10 +48,11 @@ multiple_spaces_regex = r'\s{2,}'
  # nbsp_pattern = re.compile(nbsp_pattern_regex)


- def initial_clean(texts , progress=gr.Progress()):
+ def initial_clean(texts:List[str] , progress=gr.Progress()):
      texts = pl.Series(texts)#[]

-     text = texts.str.replace_all(html_pattern_regex, '')
+     text = texts.str.replace_all(replace_backslash, '/')
+     text = text.str.replace_all(html_pattern_regex, '')
      text = text.str.replace_all(email_start_pattern_regex, '')
      text = text.str.replace_all(email_end_pattern_regex, '')
      text = text.str.replace_all(email_pattern_regex, '')
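The new replace_backslash step runs before the HTML and e-mail regexes, swapping literal backslashes for forward slashes, presumably so that cleaned text containing backslashes no longer causes problems downstream (per the commit message). A small illustration of the polars call it uses (the sample strings are made up):

    import polars as pl

    texts = pl.Series(["C:\\docs\\report_2024.txt", "<p>Many thanks, A. Person</p>"])

    # r'\\' is a regex that matches a single literal backslash
    cleaned = texts.str.replace_all(r"\\", "/")
    print(cleaned.to_list())  # ['C:/docs/report_2024.txt', '<p>Many thanks, A. Person</p>']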
search_funcs/semantic_functions.py CHANGED
@@ -292,7 +292,7 @@ def bge_simple_retrieval(query_str:str, vectorstore, docs, orig_df_col:str, k_va

      query_str_file = query_str.replace(" ", "_")

-     results_df_name = "semantic_search_result_" + today_rev + "_" + query_str_file + ".xlsx"
+     results_df_name = "output/semantic_search_result_" + today_rev + "_" + query_str_file + ".xlsx"

      print("Saving search output to file")
      progress(0.7, desc = "Saving search output to file")
@@ -589,7 +589,7 @@ def chroma_retrieval_deprecated(query_str:str, vectorstore, docs, orig_df_col:st

      results_df_out = process_data_from_scores_df(df_docs, in_join_file, out_passages, vec_score_cut_off, vec_weight, orig_df_col, in_join_column, search_df_join_column)

-     results_df_name = "semantic_search_result.csv"
+     results_df_name = "output/semantic_search_result.csv"
      results_df_out.to_csv(results_df_name, index= None)
      results_first_text = results_df_out[orig_df_col].iloc[0]

search_funcs/semantic_ingest_functions.py CHANGED
@@ -304,7 +304,7 @@ def csv_excel_text_to_docs(df, in_file, text_column, clean = "No", return_interm

      if clean == "No":
          #pd.DataFrame(data = {"Documents":page_content_series_string}).to_parquet(file_name + "_prepared_docs.parquet")
-         out_doc_file_name = file_name + "_prepared_docs.pkl.gz"
+         out_doc_file_name = "output/" + file_name + "_prepared_docs.pkl.gz"
          with gzip.open(out_doc_file_name, 'wb') as file:
              pickle.dump(doc_sections, file)

@@ -312,7 +312,7 @@ def csv_excel_text_to_docs(df, in_file, text_column, clean = "No", return_interm
      elif clean == "Yes":
          #pd.DataFrame(data = {"Documents":page_content_series_string}).to_parquet(file_name + "_prepared_docs_clean.parquet")

-         out_doc_file_name = file_name + "_cleaned_prepared_docs.pkl.gz"
+         out_doc_file_name = "output/" + file_name + "_cleaned_prepared_docs.pkl.gz"
          with gzip.open(out_doc_file_name, 'wb') as file:
              pickle.dump(doc_sections, file)
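The prepared-document pickles now land in output/ as well. A minimal sketch of the same gzip + pickle save with placeholder data (the file stem and document contents are made up, and output/ is assumed to already exist, as the Dockerfile arranges):

    import gzip
    import pickle

    doc_sections = [{"page_content": "example section text", "metadata": {"row": 0}}]  # placeholder docs

    out_doc_file_name = "output/" + "my_data" + "_prepared_docs.pkl.gz"  # "my_data" is a placeholder stem
    with gzip.open(out_doc_file_name, 'wb') as file:
        pickle.dump(doc_sections, file)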