seanpedrickcase committed on
Commit
ea0dd40
1 Parent(s): 2806807

Changed the embedding model to MiniLM-L6, as it is faster. Compressed embeddings are now int8. General improvements to API mode.

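For context on the headline change: the int8 compression adopted in search_funcs/semantic_functions.py below follows the sentence-transformers embedding-quantization pattern. A minimal sketch, mirroring the calls used in this commit (the sentences are toy examples, not app data):

from sentence_transformers import SentenceTransformer, quantize_embeddings

model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Corpus embeddings encoded straight to int8: one byte per dimension
# instead of four, so the saved .npz files are roughly a quarter the size.
corpus_embeddings = model.encode(["a happy day", "stormy weather"], precision="int8")

# The query is encoded in float32, then quantized against the stored corpus
# embeddings, as bge_semantic_search does below.
query_embedding = quantize_embeddings(
    model.encode("happiness"),
    precision="int8",
    calibration_embeddings=corpus_embeddings,
)
print(corpus_embeddings.dtype, query_embedding.dtype)  # int8 int8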
Dockerfile CHANGED
@@ -1,11 +1,8 @@
  # First stage: build dependencies
- #FROM public.ecr.aws/docker/library/python:3.11.9-slim-bookworm

- # Trying Python 3.10, as I saw somewhere that Python 3.11 may result in corrupted openpyxl xlsx outputs
- FROM public.ecr.aws/docker/library/python:3.10.14-slim-bookworm
-
- # Install Lambda web adapter in case you want to run with an AWS Lambda function URL
- COPY --from=public.ecr.aws/awsguru/aws-lambda-adapter:0.8.3 /lambda-adapter /opt/extensions/lambda-adapter

  # Install wget
  RUN apt-get update && apt-get install -y wget
@@ -20,14 +17,12 @@ COPY requirements.txt .
  RUN pip install --no-cache-dir -r requirements.txt

  # Gradio needs to be installed after due to a conflict with spacy in requirements
- RUN pip install --no-cache-dir gradio==4.36.1

- # Download the BGE embedding model during the build process
- RUN curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | bash
- RUN apt-get install git-lfs -y
- RUN git lfs install
- RUN git clone https://huggingface.co/BAAI/bge-small-en-v1.5 /model/bge
- RUN rm -rf /model/bge/.git

  # Set up a new user named "user" with user ID 1000
  RUN useradd -m -u 1000 user
@@ -47,6 +42,7 @@ ENV HOME=/home/user \
  PATH=/home/user/.local/bin:$PATH \
  PYTHONPATH=$HOME/app \
  PYTHONUNBUFFERED=1 \
  GRADIO_ALLOW_FLAGGING=never \
  GRADIO_NUM_PORTS=1 \
  GRADIO_SERVER_NAME=0.0.0.0 \
 
  # First stage: build dependencies
+ FROM public.ecr.aws/docker/library/python:3.11.9-slim-bookworm

+ # Optional - install the Lambda web adapter in case you want to run with an AWS Lambda function URL
+ # COPY --from=public.ecr.aws/awsguru/aws-lambda-adapter:0.8.3 /lambda-adapter /opt/extensions/lambda-adapter

  # Install wget
  RUN apt-get update && apt-get install -y wget

  RUN pip install --no-cache-dir -r requirements.txt

  # Gradio needs to be installed after due to a conflict with spacy in requirements
+ RUN pip install --no-cache-dir gradio==4.37.2

+ # Download the MiniLM embedding model during the build process. Create a directory for the model and download specific files using huggingface_hub
+ RUN mkdir -p /model/minilm
+ COPY download_model.py /src/download_model.py
+ RUN python /src/download_model.py

  # Set up a new user named "user" with user ID 1000
  RUN useradd -m -u 1000 user

  PATH=/home/user/.local/bin:$PATH \
  PYTHONPATH=$HOME/app \
  PYTHONUNBUFFERED=1 \
+ PYTHONDONTWRITEBYTECODE=1 \
  GRADIO_ALLOW_FLAGGING=never \
  GRADIO_NUM_PORTS=1 \
  GRADIO_SERVER_NAME=0.0.0.0 \
app.py CHANGED
@@ -7,7 +7,7 @@ PandasDataFrame = Type[pd.DataFrame]

  from search_funcs.bm25_functions import prepare_bm25_input_data, prepare_bm25, bm25_search
  from search_funcs.semantic_ingest_functions import csv_excel_text_to_docs
- from search_funcs.semantic_functions import docs_to_bge_embed_np_array, bge_semantic_search
  from search_funcs.helper_functions import display_info, initial_data_load, put_columns_in_join_df, get_temp_folder_path, empty_folder, get_connection_params, output_folder
  from search_funcs.spacy_search_funcs import spacy_fuzzy_search
  from search_funcs.aws_functions import load_data_from_aws
@@ -24,24 +24,29 @@ with app:

  # BM25 state objects
  orig_keyword_data_state = gr.State(pd.DataFrame()) # Original data that is not changed #gr.Dataframe(pd.DataFrame(),visible=False) #gr.State(pd.DataFrame())
  prepared_keyword_data_state = gr.State(pd.DataFrame()) # Data frame that contains modified data #gr.Dataframe(pd.DataFrame(),visible=False) #gr.State(pd.DataFrame())
  #tokenised_prepared_keyword_data_state = gr.State([]) # This is data that has been loaded in as tokens #gr.Dataframe(pd.DataFrame(),visible=False) #gr.State()
  tokenised_prepared_keyword_data_state = gr.State([]) # Data that has been prepared for search (tokenised) #gr.Dataframe(np.array([]), type="array", visible=False) #gr.State([])
- bm25_search_index_state = gr.State()
-

  # Semantic search state objects
  orig_semantic_data_state = gr.State(pd.DataFrame()) #gr.Dataframe(pd.DataFrame(),visible=False) # gr.State(pd.DataFrame())
  semantic_data_state = gr.State(pd.DataFrame()) #gr.Dataframe(pd.DataFrame(),visible=False) # gr.State(pd.DataFrame())
  semantic_input_document_format = gr.State([])
  embeddings_state = gr.State(np.array([])) #gr.Dataframe(np.array([]), type="numpy", visible=False) #gr.State(np.array([])) # globals()["embeddings"]
  semantic_k_val = gr.Number(9999, visible=False)

  # State objects for app in general
  session_hash_state = gr.State("")
  s3_output_folder_state = gr.State("")
  join_data_state = gr.State(pd.DataFrame()) #gr.Dataframe(pd.DataFrame(), visible=False) #gr.State(pd.DataFrame())
- output_file_state = gr.Dropdown([], visible=False, allow_custom_value=True) #gr.Dataframe(type="array", visible=False) #gr.State([])

  # Informational state objects
  in_k1_info = gr.State("""k1: Constant used for influencing the term frequency saturation. After saturation is reached, additional
@@ -95,7 +100,7 @@ depends on factors such as the type of documents or queries. Information taken f
  """
  **Thematic/semantic search**

- This search type enables you to search for broader themes (e.g. happiness, nature) and the search will pick out text passages that relate to these themes even if they don't contain the exact words. 1. Load in data file (ideally a file with '_cleaned' at the end of the name, a pkl.gz file), with (optionally) the 'embeddings... .npz' file in the same folder to save loading time. 2. Select the field in your data to search. 3. Wait for the data file to be prepared for search. 4. Enter the search term in the 'Enter semantic search query here' box below and press Enter/click on 'Start semantic search'. 5. Your search results will be saved in a csv file and will be presented in the 'File output' area below.
  """)

  with gr.Row():
@@ -122,7 +127,7 @@ depends on factors such as the type of documents or queries. Information taken f
  with gr.Row():
  in_clean_data = gr.Dropdown(label = "Clean text during load (remove html tags). For large files this may take some time!", value="No", choices=["Yes", "No"])
  return_intermediate_files = gr.Dropdown(label = "Return intermediate processing files from file preparation. Files can be loaded in to save processing time in future.", value="No", choices=["Yes", "No"])
- embedding_super_compress = gr.Dropdown(label = "Round embeddings to three dp for smaller files with less accuracy.", value="Yes", choices=["Yes", "No"])
  #save_clean_data_button = gr.Button(value = "Save loaded data to file", scale = 1)
  with gr.Accordion(label="Keyword search options", open = False):
  with gr.Row():
@@ -194,12 +199,12 @@ depends on factors such as the type of documents or queries. Information taken f
  # Load in a csv/excel file for semantic search
  in_semantic_file.change(initial_data_load, inputs=[in_semantic_file], outputs=[in_semantic_column, search_df_join_column, semantic_data_state, orig_semantic_data_state, bm25_search_index_state, embeddings_state, tokenised_prepared_keyword_data_state, semantic_load_progress, current_source_semantic])
  load_semantic_data_button.click(
- csv_excel_text_to_docs, inputs=[semantic_data_state, in_semantic_file, in_semantic_column, in_clean_data, return_intermediate_files], outputs=[semantic_input_document_format, semantic_load_progress, output_file_state]).\
- then(docs_to_bge_embed_np_array, inputs=[semantic_input_document_format, in_semantic_file, embeddings_state, output_file_state, in_clean_data, return_intermediate_files, embedding_super_compress], outputs=[semantic_load_progress, embeddings_state, semantic_output_file, output_file_state]) # vectorstore_state
-
  # Semantic search query
- semantic_submit.click(bge_semantic_search, inputs=[semantic_query, embeddings_state, semantic_input_document_format, semantic_k_val, semantic_min_distance, join_data_state, in_join_column, search_df_join_column], outputs=[semantic_output_single_text, semantic_output_file], api_name="semantic_search")
- semantic_query.submit(bge_semantic_search, inputs=[semantic_query, embeddings_state, semantic_input_document_format, semantic_k_val, semantic_min_distance, join_data_state, in_join_column, search_df_join_column], outputs=[semantic_output_single_text, semantic_output_file])

  app.load(get_connection_params, inputs=None, outputs=[session_hash_state, s3_output_folder_state])

  from search_funcs.bm25_functions import prepare_bm25_input_data, prepare_bm25, bm25_search
  from search_funcs.semantic_ingest_functions import csv_excel_text_to_docs
+ from search_funcs.semantic_functions import load_embedding_model, docs_to_bge_embed_np_array, bge_semantic_search
  from search_funcs.helper_functions import display_info, initial_data_load, put_columns_in_join_df, get_temp_folder_path, empty_folder, get_connection_params, output_folder
  from search_funcs.spacy_search_funcs import spacy_fuzzy_search
  from search_funcs.aws_functions import load_data_from_aws

  # BM25 state objects
  orig_keyword_data_state = gr.State(pd.DataFrame()) # Original data that is not changed #gr.Dataframe(pd.DataFrame(),visible=False) #gr.State(pd.DataFrame())
+ #orig_keyword_data_state = gr.State(pd.DataFrame()) # Original data that is not changed #gr.Dataframe(pd.DataFrame(),visible=False) #gr.State(pd.DataFrame())
  prepared_keyword_data_state = gr.State(pd.DataFrame()) # Data frame that contains modified data #gr.Dataframe(pd.DataFrame(),visible=False) #gr.State(pd.DataFrame())
  #tokenised_prepared_keyword_data_state = gr.State([]) # This is data that has been loaded in as tokens #gr.Dataframe(pd.DataFrame(),visible=False) #gr.State()
  tokenised_prepared_keyword_data_state = gr.State([]) # Data that has been prepared for search (tokenised) #gr.Dataframe(np.array([]), type="array", visible=False) #gr.State([])
+ bm25_search_index_state = gr.State()

  # Semantic search state objects
  orig_semantic_data_state = gr.State(pd.DataFrame()) #gr.Dataframe(pd.DataFrame(),visible=False) # gr.State(pd.DataFrame())
  semantic_data_state = gr.State(pd.DataFrame()) #gr.Dataframe(pd.DataFrame(),visible=False) # gr.State(pd.DataFrame())
  semantic_input_document_format = gr.State([])
+
+ embeddings_model_name_state = gr.State("sentence-transformers/all-MiniLM-L6-v2")#"BAAI/bge-small-en-v1.5")
+ embeddings_model_loc_state = gr.State("minilm/")#"bge/")
  embeddings_state = gr.State(np.array([])) #gr.Dataframe(np.array([]), type="numpy", visible=False) #gr.State(np.array([])) # globals()["embeddings"]
+ embeddings_model_state = gr.State()
+ torch_device_state = gr.State("cpu")
  semantic_k_val = gr.Number(9999, visible=False)

  # State objects for app in general
  session_hash_state = gr.State("")
  s3_output_folder_state = gr.State("")
  join_data_state = gr.State(pd.DataFrame()) #gr.Dataframe(pd.DataFrame(), visible=False) #gr.State(pd.DataFrame())
+ output_file_state = gr.State([]) #gr.Dataframe(type="array", visible=False) #gr.State([])

  # Informational state objects
  in_k1_info = gr.State("""k1: Constant used for influencing the term frequency saturation. After saturation is reached, additional

  """
  **Thematic/semantic search**

+ This search type enables you to search for general terms (e.g. happiness, nature) and the search will pick out text passages that are most semantically similar to them. 1. Load in data file (ideally a file with '_cleaned' at the end of the name, a pkl.gz file), with (optionally) the 'embeddings... .npz' file in the same folder to save loading time. 2. Select the field in your data to search. 3. Wait for the data file to be prepared for search. 4. Enter the search term in the 'Enter semantic search query here' box below and press Enter/click on 'Start semantic search'. 5. Your search results will be saved in a csv file and will be presented in the 'File output' area below.
  """)

  with gr.Row():

  with gr.Row():
  in_clean_data = gr.Dropdown(label = "Clean text during load (remove html tags). For large files this may take some time!", value="No", choices=["Yes", "No"])
  return_intermediate_files = gr.Dropdown(label = "Return intermediate processing files from file preparation. Files can be loaded in to save processing time in future.", value="No", choices=["Yes", "No"])
+ embeddings_compress = gr.Dropdown(label = "Quantise embeddings to int8 precision for smaller files with less accuracy.", value="Yes", choices=["Yes", "No"])
  #save_clean_data_button = gr.Button(value = "Save loaded data to file", scale = 1)
  with gr.Accordion(label="Keyword search options", open = False):
  with gr.Row():

  # Load in a csv/excel file for semantic search
  in_semantic_file.change(initial_data_load, inputs=[in_semantic_file], outputs=[in_semantic_column, search_df_join_column, semantic_data_state, orig_semantic_data_state, bm25_search_index_state, embeddings_state, tokenised_prepared_keyword_data_state, semantic_load_progress, current_source_semantic])
  load_semantic_data_button.click(
+ csv_excel_text_to_docs, inputs=[semantic_data_state, in_semantic_file, in_semantic_column, in_clean_data, return_intermediate_files], outputs=[semantic_input_document_format, semantic_load_progress, output_file_state], api_name="convert_texts_to_documents").\
+ then(docs_to_bge_embed_np_array, inputs=[semantic_input_document_format, in_semantic_file, output_file_state, in_clean_data, embeddings_state, embeddings_model_name_state, embeddings_model_loc_state, return_intermediate_files, embeddings_compress], outputs=[semantic_load_progress, embeddings_state, semantic_output_file, output_file_state, embeddings_model_state], api_name="embed_documents")
+
  # Semantic search query
+ semantic_submit.click(bge_semantic_search, inputs=[semantic_query, embeddings_state, semantic_input_document_format, semantic_k_val, semantic_min_distance, embeddings_model_state, embeddings_model_name_state, embeddings_compress, join_data_state, in_join_column, search_df_join_column], outputs=[semantic_output_single_text, semantic_output_file], api_name="semantic_search")
+ semantic_query.submit(bge_semantic_search, inputs=[semantic_query, embeddings_state, semantic_input_document_format, semantic_k_val, semantic_min_distance, embeddings_model_state, embeddings_model_name_state, embeddings_compress, join_data_state, in_join_column, search_df_join_column], outputs=[semantic_output_single_text, semantic_output_file])

  app.load(get_connection_params, inputs=None, outputs=[session_hash_state, s3_output_folder_state])
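The api_name arguments above are part of the API-mode improvements: each named event becomes a callable endpoint. A hedged sketch of inspecting them with gradio_client (the URL is a placeholder for wherever the app is hosted):

from gradio_client import Client

# Connect to a running instance of the app (placeholder URL)
client = Client("http://localhost:7860/")

# Lists the endpoints registered above - /convert_texts_to_documents,
# /embed_documents and /semantic_search - with their expected inputs and outputs.
print(client.view_api())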
download_model.py ADDED
@@ -0,0 +1,15 @@
+ from huggingface_hub import hf_hub_download
+
+ # Define the repository and files to download
+ repo_id = "sentence-transformers/all-MiniLM-L6-v2" #"BAAI/bge-small-en-v1.5"
+ files_to_download = [
+     "config.json",
+     "pytorch_model.bin",
+     "tokenizer_config.json",
+     "vocab.txt"
+ ]
+
+ # Download each file and save it to the /model/minilm directory
+ for file_name in files_to_download:
+     print("Checking for file", file_name)
+     hf_hub_download(repo_id=repo_id, filename=file_name, local_dir="/model/minilm") #"/model/bge"
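At runtime, load_embedding_model in search_funcs/semantic_functions.py points SentenceTransformer at this directory. A minimal sketch of that load, assuming the four files above are sufficient for your sentence-transformers version:

from sentence_transformers import SentenceTransformer

# If the directory is not a full sentence-transformers checkpoint, the library
# wraps the bare transformer with a default mean-pooling module and warns.
model = SentenceTransformer("/model/minilm")
print(model.encode(["a short test sentence"]).shape)  # (1, 384) for MiniLM-L6-v2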
output/36de65711121889ccdcb768b85e97e386d8fe4bd/keyword_search_result_20240702_school.xlsm ADDED
Binary file (9.92 kB).
 
requirements.txt CHANGED
@@ -2,7 +2,7 @@ pandas==2.2.2
  polars==0.20.3
  pyarrow==14.0.2
  openpyxl==3.1.3
- torch==2.3.1
  spacy
  en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1.tar.gz
  gradio

  polars==0.20.3
  pyarrow==14.0.2
  openpyxl==3.1.3
+ torch==2.3.1 --index-url https://download.pytorch.org/whl/cpu
  spacy
  en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1.tar.gz
  gradio
requirements_gpu.txt CHANGED
@@ -2,7 +2,7 @@ pandas==2.2.2
  polars==0.20.3
  pyarrow==14.0.2
  openpyxl==3.1.3
- torch==2.3.1 --index-url https://download.pytorch.org/whl/cu121
  spacy
  en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1.tar.gz
  gradio

  polars==0.20.3
  pyarrow==14.0.2
  openpyxl==3.1.3
+ torch==2.4.0 --index-url https://download.pytorch.org/whl/nightly/cu121
  spacy
  en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1.tar.gz
  gradio
search_funcs/bm25_functions.py CHANGED
@@ -15,28 +15,7 @@ from datetime import datetime
  today_rev = datetime.now().strftime("%Y%m%d")

  from search_funcs.clean_funcs import initial_clean # get_lemma_tokens, stem_sentence
- from search_funcs.helper_functions import get_file_path_end_with_ext, get_file_path_end, create_highlighted_excel_wb, ensure_output_folder_exists, output_folder
-
- # Load the SpaCy model
- from spacy.cli.download import download
- import spacy
- spacy.prefer_gpu()
-
- #os.system("python -m spacy download en_core_web_sm")
- try:
-     import en_core_web_sm
-     nlp = en_core_web_sm.load()
-     print("Successfully imported spaCy model")
-     #nlp = spacy.load("en_core_web_sm")
-     #print(nlp._path)
- except:
-     download("en_core_web_sm")
-     nlp = spacy.load("en_core_web_sm")
-     print("Successfully imported spaCy model")
-     #print(nlp._path)
-
- # including punctuation rules and exceptions
- tokenizer = nlp.tokenizer

  PARAM_K1 = 1.5
  PARAM_B = 0.75
@@ -230,6 +209,35 @@ class BM25:
      with open(f"{output_folder}{filename}.pkl", "rb") as fsave:
          return pickle.load(fsave)

  def prepare_bm25_input_data(
      in_file: list,
      text_column: str,
@@ -348,9 +356,8 @@ def prepare_bm25_input_data(
      else:
          tokeniser_tic = time.perf_counter()
          prepared_search_text_list = []
-         batch_size = 256
-         for doc in tokenizer.pipe(progress.tqdm(prepared_text_as_list, desc = "Tokenising text", unit = "rows"), batch_size=batch_size):
-             prepared_search_text_list.append([token.text for token in doc])

          tokeniser_toc = time.perf_counter()
          tokenizer_time_out = f"Tokenising the text took {tokeniser_toc - tokeniser_tic:0.1f} seconds."
@@ -519,26 +526,18 @@ def prepare_bm25(

      return message, None, bm25, prepared_search_text_list

- def convert_bm25_query_to_tokens(free_text_query, clean="No"):
-     '''
-     Split open text query into tokens and then lemmatise to get the core of the word. Currently 'clean' has no effect.
-     '''
-
-     if clean=="Yes":
-         split_query = tokenizer(free_text_query.lower())
-         out_query = [token.text for token in split_query]
-         #out_query = stem_sentence(out_query)
-     else:
-         split_query = tokenizer(free_text_query.lower())
-         out_query = [token.text for token in split_query]
-
-     print("Search query out is:", out_query)
-
-     if isinstance(out_query,str):
-         print("Converting string")
-         out_query = [out_query]
-
-     return out_query

  def bm25_search(
      free_text_query: str,
@@ -596,9 +595,11 @@ def bm25_search(
      Returns
      -------
      tuple
-         A tuple containing a message, the search results file name (if any), the BM25 object, and the prepared search text list.
      """

      progress(0, desc = "Conducting keyword search")

      print("in_join_file at start of bm25_search:", in_join_file)
@@ -611,10 +612,7 @@ def bm25_search(
      # print("bm25:", bm25)

      # Prepare query
-     if (clean == "Yes") | (text_column.endswith("_cleaned")):
-         token_query = convert_bm25_query_to_tokens(free_text_query, clean="Yes")
-     else:
-         token_query = convert_bm25_query_to_tokens(free_text_query, clean="No")

      # Perform search
      print("Searching")
@@ -685,6 +683,13 @@ def bm25_search(

      results_first_text = results_df_out[text_column].iloc[0]

      print("Returning results")

-     return results_first_text, results_df_name

  today_rev = datetime.now().strftime("%Y%m%d")

  from search_funcs.clean_funcs import initial_clean # get_lemma_tokens, stem_sentence
+ from search_funcs.helper_functions import get_file_path_end_with_ext, get_file_path_end, create_highlighted_excel_wb, ensure_output_folder_exists, output_folder, load_spacy_model

  PARAM_K1 = 1.5
  PARAM_B = 0.75

      with open(f"{output_folder}{filename}.pkl", "rb") as fsave:
          return pickle.load(fsave)

+ def tokenise_text_spacy(prepared_text_as_list:List[str], progress = gr.Progress(track_tqdm=True)):
+     '''
+     Tokenise a list of texts using the spaCy package and the en_core_web_sm model.
+     '''
+
+     # Load spaCy model
+     nlp = load_spacy_model()
+
+     prepared_search_text_list = []
+     batch_size = 256
+     for doc in nlp.tokenizer.pipe(progress.tqdm(prepared_text_as_list, desc = "Tokenising text", unit = "rows"), batch_size=batch_size):
+         prepared_search_text_list.append([token.text for token in doc])
+
+     return prepared_search_text_list
+
+ def tokenise_text_nltk(prepared_text_as_list: List[str], progress= gr.Progress(track_tqdm=True)):
+     """
+     Tokenise a list of texts using the NLTK package.
+     """
+     import nltk
+     nltk.download('punkt', quiet=True) # Download the necessary resource if not already present
+
+     prepared_search_text_list = []
+     for text in progress.tqdm(prepared_text_as_list, desc="Tokenising text", unit="rows"):
+         prepared_search_text_list.append(nltk.word_tokenize(text.lower())) # Lowercase for consistency
+
+     return prepared_search_text_list
+
  def prepare_bm25_input_data(
      in_file: list,
      text_column: str,

      else:
          tokeniser_tic = time.perf_counter()
          prepared_search_text_list = []
+
+         prepared_search_text_list = tokenise_text_spacy(prepared_text_as_list)

          tokeniser_toc = time.perf_counter()
          tokenizer_time_out = f"Tokenising the text took {tokeniser_toc - tokeniser_tic:0.1f} seconds."

      return message, None, bm25, prepared_search_text_list

+ def convert_bm25_query_to_tokens(free_text_query:str):
+     """
+     Split open text query into tokens.
+     """
+     split_query = tokenise_text_spacy([free_text_query.lower()])
+
+     # Flatten the list of lists into a single list
+     flattened_query = [token for sublist in split_query for token in sublist]
+
+     print("Search query out is:", flattened_query)
+
+     return flattened_query

  def bm25_search(
      free_text_query: str,

      Returns
      -------
      tuple
+         A tuple containing a message and the output search results files (if any).
      """

+     output_files = []
+
      progress(0, desc = "Conducting keyword search")

      print("in_join_file at start of bm25_search:", in_join_file)

      # print("bm25:", bm25)

      # Prepare query
+     token_query = convert_bm25_query_to_tokens(free_text_query)

      # Perform search
      print("Searching")

      results_first_text = results_df_out[text_column].iloc[0]

+     output_files.append(results_df_name)
+
+     csv_output_file = output_folder + "keyword_search_result_" + today_rev + "_" + query_str_file + ".csv"
+     results_df_out.to_csv(csv_output_file, index=None)
+
+     output_files.append(csv_output_file)
+
      print("Returning results")

+     return results_first_text, output_files
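For illustration, the NLTK path added above reduces to this outside of Gradio (a minimal sketch; the sample sentences are made up):

import nltk

nltk.download('punkt', quiet=True)  # fetch the tokeniser data if missing

texts = ["The Quick brown fox.", "BM25 ranks keyword matches."]
tokenised = [nltk.word_tokenize(text.lower()) for text in texts]
print(tokenised)
# [['the', 'quick', 'brown', 'fox', '.'], ['bm25', 'ranks', 'keyword', 'matches', '.']]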
search_funcs/clean_funcs.py CHANGED
@@ -1,13 +1,9 @@
  # ## Some functions to clean text
-
  import re
  import string
- import polars as pl

  # Add calendar months onto stop words
  import calendar
- #from tqdm import tqdm
- import gradio as gr

  from typing import List

@@ -15,7 +11,6 @@ from typing import List
  custom_words = []
  my_stop_words = custom_words

-
  cal_month = (list(calendar.month_name))
  cal_month = [x.lower() for x in cal_month]

@@ -24,7 +19,6 @@ cal_month = [i for i in cal_month if i]
  #print(cal_month)
  custom_words.extend(cal_month)

-
  # #### Some of my cleaning functions
  replace_backslash = r'\\'
  email_start_pattern_regex = r'.*importance:|.*subject:'
@@ -37,19 +31,19 @@ warning_pattern_regex = r'caution: this email originated from outside of the org
  nbsp_pattern_regex = r'&nbsp;'
  multiple_spaces_regex = r'\s{2,}'

- # Pre-compiling the regular expressions for efficiency
- # email_start_pattern = re.compile(email_start_pattern_regex)
- # email_end_pattern = re.compile(email_end_pattern_regex)
- # html_pattern = re.compile(html_pattern_regex)
- # email_pattern = re.compile(email_end_pattern_regex)
- # num_pattern = re.compile(num_pattern_regex)
- # postcode_pattern = re.compile(postcode_pattern_regex)
- # warning_pattern = re.compile(warning_pattern_regex)
- # nbsp_pattern = re.compile(nbsp_pattern_regex)

- def initial_clean(texts:List[str] , progress=gr.Progress()):
-     texts = pl.Series(texts)#[]

      text = texts.str.replace_all(replace_backslash, '/')
      text = text.str.replace_all(html_pattern_regex, '')
@@ -62,6 +56,33 @@ def initial_clean(texts:List[str] , progress=gr.Progress()):

      return text

  def remove_hyphens(text_text):
      return re.sub(r'(\w+)-(\w+)-?(\w)?', r'\1 \2 \3', text_text)

  # ## Some functions to clean text
  import re
  import string

  # Add calendar months onto stop words
  import calendar

  from typing import List

  custom_words = []
  my_stop_words = custom_words

  cal_month = (list(calendar.month_name))
  cal_month = [x.lower() for x in cal_month]

  #print(cal_month)
  custom_words.extend(cal_month)

  # #### Some of my cleaning functions
  replace_backslash = r'\\'
  email_start_pattern_regex = r'.*importance:|.*subject:'

  nbsp_pattern_regex = r'&nbsp;'
  multiple_spaces_regex = r'\s{2,}'

+ def initial_clean(texts:List[str]):
+     """
+     This function cleans a list of text strings by performing various replacements using polars.
+
+     Args:
+         texts (List[str]): A list of strings to clean.
+
+     Returns:
+         List[str]: A list of cleaned strings.
+     """
+     import polars as pl
+
+     texts = pl.Series(texts)

      text = texts.str.replace_all(replace_backslash, '/')
      text = text.str.replace_all(html_pattern_regex, '')

      return text

+
+ def initial_clean_pandas(texts: List[str]):
+     """
+     This function cleans a list of text strings by performing various replacements using pandas.
+
+     Args:
+         texts (List[str]): A list of strings to clean.
+
+     Returns:
+         List[str]: A list of cleaned strings.
+     """
+     import pandas as pd
+
+     # Create a pandas Series from the text list for easier manipulation
+     text_series = pd.Series(texts)
+
+     # Replace patterns with pandas string methods (`.str.replace`)
+     text_series = text_series.astype(str).str.replace(replace_backslash, '/', regex=True)
+     text_series = text_series.astype(str).str.replace(html_pattern_regex, '', regex=True)
+     text_series = text_series.astype(str).str.replace(email_start_pattern_regex, '', regex=True)
+     text_series = text_series.astype(str).str.replace(email_end_pattern_regex, '', regex=True)
+     text_series = text_series.astype(str).str.replace(email_pattern_regex, '', regex=True)
+     text_series = text_series.astype(str).str.replace(multiple_spaces_regex, ' ', regex=True)
+
+     # Convert cleaned Series back to a list
+     return text_series.tolist()
+
  def remove_hyphens(text_text):
      return re.sub(r'(\w+)-(\w+)-?(\w)?', r'\1 \2 \3', text_text)
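A minimal usage sketch for the new pandas-based cleaner (the sample strings are illustrative; the exact output depends on the module's regex patterns):

from search_funcs.clean_funcs import initial_clean_pandas

dirty = ["<p>Some   tagged text</p>", "subject: please read\nhello"]
print(initial_clean_pandas(dirty))  # HTML tags and email headers stripped, extra spaces collapsed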
search_funcs/helper_functions.py CHANGED
@@ -67,7 +67,7 @@ def get_connection_params(request: gr.Request):
  #print("Query parameters:", dict(request.query_params))
  # To get the underlying FastAPI items you would need to use await and some fancy @ stuff for a live query: https://fastapi.tiangolo.com/vi/reference/request/
  #print("Request dictionary to object:", request.request.body())
- print("Session hash:", request.session_hash)

  if 'x-cognito-id' in request.headers:
      out_session_hash = request.headers['x-cognito-id']
@@ -77,11 +77,11 @@ def get_connection_params(request: gr.Request):
  else:
      out_session_hash = request.session_hash
      base_folder = "temp-files/"
-     print("Cognito ID not found. Using session hash as save folder.")

  output_folder = base_folder + out_session_hash + "/"
- if bucket_name:
-     print("S3 output folder is: " + "s3://" + bucket_name + "/" + output_folder)

  return out_session_hash, output_folder
  else:
@@ -281,6 +281,21 @@ def put_columns_in_join_df(in_file:str):

  return gr.Dropdown(choices=concat_choices), new_df, out_message

  def display_info(info_component):
      gr.Info(info_component)

  #print("Query parameters:", dict(request.query_params))
  # To get the underlying FastAPI items you would need to use await and some fancy @ stuff for a live query: https://fastapi.tiangolo.com/vi/reference/request/
  #print("Request dictionary to object:", request.request.body())
+ #print("Session hash:", request.session_hash)

  if 'x-cognito-id' in request.headers:
      out_session_hash = request.headers['x-cognito-id']

  else:
      out_session_hash = request.session_hash
      base_folder = "temp-files/"
+     #print("Cognito ID not found. Using session hash as save folder.")

  output_folder = base_folder + out_session_hash + "/"
+ #if bucket_name:
+ #    print("S3 output folder is: " + "s3://" + bucket_name + "/" + output_folder)

  return out_session_hash, output_folder
  else:

  return gr.Dropdown(choices=concat_choices), new_df, out_message

+ def load_spacy_model():
+     # Load the SpaCy model
+     from spacy.cli.download import download
+     import spacy
+     spacy.prefer_gpu()
+
+     try:
+         import en_core_web_sm
+         nlp = en_core_web_sm.load()
+         print("Successfully imported spaCy model")
+     except:
+         download("en_core_web_sm")
+         nlp = spacy.load("en_core_web_sm")
+         print("Successfully imported spaCy model")
+     return nlp

  def display_info(info_component):
      gr.Info(info_component)
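A minimal usage sketch of the consolidated loader (spaCy and the en_core_web_sm package must be installed or downloadable):

from search_funcs.helper_functions import load_spacy_model

nlp = load_spacy_model()  # falls back to downloading en_core_web_sm on first run
print([token.text for token in nlp.tokenizer("The quick brown fox")])
# ['The', 'quick', 'brown', 'fox']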
search_funcs/semantic_functions.py CHANGED
@@ -6,58 +6,61 @@ import gradio as gr
  import numpy as np
  from datetime import datetime
  from search_funcs.helper_functions import get_file_path_end, create_highlighted_excel_wb, ensure_output_folder_exists, output_folder
- from torch import cuda, backends
- from sentence_transformers import SentenceTransformer
  PandasDataFrame = Type[pd.DataFrame]
-
  today_rev = datetime.now().strftime("%Y%m%d")

- # Check for torch cuda
- print("Is CUDA enabled? ", cuda.is_available())
- print("Is a CUDA device available on this computer?", backends.cudnn.enabled)
- if cuda.is_available():
-     torch_device = "cuda"
-     os.system("nvidia-smi")
-
- else:
-     torch_device = "cpu"
-
- print("Device used is: ", torch_device)
-
- # Load embeddings
- embeddings_name = "BAAI/bge-small-en-v1.5"
-
- # Define a list of possible local locations to search for the model
- local_embeddings_locations = [
-     "model/bge/", # Potential local location
-     "/model/bge/", # Potential location in Docker container
-     "/home/user/app/model/bge/" # This is inside a Docker container
- ]
-
- # Attempt to load the model from each local location
- for location in local_embeddings_locations:
-     try:
-         embeddings_model = SentenceTransformer(location)
-         print(f"Found local model installation at: {location}")
-         break # Exit the loop if the model is found
-     except Exception as e:
-         print(f"Failed to load model from {location}: {e}")
-         continue
- else:
-     # If the loop completes without finding the model in any local location
-     embeddings_model = SentenceTransformer(embeddings_name)
-     print("Could not find local model installation. Downloading from Huggingface")
-
  def docs_to_bge_embed_np_array(
      docs_out: list,
      in_file: list,
-     embeddings_state: np.ndarray,
      output_file_state: str,
-     clean: str,
      return_intermediate_files: str = "No",
-     embeddings_super_compress: str = "No",
-     embeddings_model: SentenceTransformer = embeddings_model,
      progress: gr.Progress = gr.Progress(track_tqdm=True)
  ) -> tuple:
  """
@@ -66,18 +69,20 @@ def docs_to_bge_embed_np_array(
  Parameters:
  - docs_out (list): List of documents to be embedded.
  - in_file (list): List of input files.
- - embeddings_state (np.ndarray): Current state of embeddings.
  - output_file_state (str): State of the output file.
  - clean (str): Indicates if the data should be cleaned.
  - return_intermediate_files (str, optional): Whether to return intermediate files. Default is "No".
- - embeddings_super_compress (str, optional): Whether to super compress the embeddings. Default is "No".
- - embeddings_model (SentenceTransformer, optional): The embeddings model to use. Default is embeddings_model.
  - progress (gr.Progress, optional): Progress tracker for the function. Default is gr.Progress(track_tqdm=True).

  Returns:
  - tuple: A tuple containing the output message, embeddings, and output file state.
  """

  ensure_output_folder_exists(output_folder)

@@ -102,12 +107,29 @@ def docs_to_bge_embed_np_array(

  out_message = "Document processing complete. Ready to search."

-
  if embeddings_state.size == 0:
      tic = time.perf_counter()
      print("Starting to embed documents.")

-     embeddings_out = embeddings_model.encode(sentences=page_contents, show_progress_bar = True, batch_size = 32, normalize_embeddings=True) # For BGE

      toc = time.perf_counter()
      time_out = f"The embedding took {toc - tic:0.1f} seconds"
@@ -119,27 +141,25 @@ def docs_to_bge_embed_np_array(
  else: data_file_name_no_ext = data_file_name_no_ext

  progress(0.9, desc = "Saving embeddings to file")
- if embeddings_super_compress == "No":
      semantic_search_file_name = output_folder + data_file_name_no_ext + '_bge_embeddings.npz'
-     np.savez_compressed(semantic_search_file_name, embeddings_out)
  else:
      semantic_search_file_name = output_folder + data_file_name_no_ext + '_bge_embedding_compress.npz'
-     embeddings_out_round = np.round(embeddings_out, 3)
-     embeddings_out_round *= 100 # Rounding not currently used
-     np.savez_compressed(semantic_search_file_name, embeddings_out_round)

  output_file_state.append(semantic_search_file_name)

- return out_message, embeddings_out, output_file_state, output_file_state

- return out_message, embeddings_out, output_file_state, output_file_state
  else:
      # Just return existing embeddings if already exist
      embeddings_out = embeddings_state

  print(out_message)

- return out_message, embeddings_out, output_file_state, output_file_state

  def process_data_from_scores_df(
      df_docs: pd.DataFrame,
@@ -226,14 +246,15 @@ def bge_semantic_search(
  embeddings: np.ndarray,
  documents: list,
  k_val: int,
- vec_score_cut_off: float,
  in_join_file: pd.DataFrame,
  in_join_column: str = None,
- search_df_join_column: str = None,
- device: str = torch_device,
- embeddings_model: SentenceTransformer = embeddings_model,
  progress: gr.Progress = gr.Progress(track_tqdm=True)
- ) -> pd.DataFrame:
  """
  Perform a semantic search using the BGE model.

@@ -243,33 +264,83 @@ def bge_semantic_search(
  - documents (list): The list of documents to search.
  - k_val (int): The number of top results to return.
  - vec_score_cut_off (float): The score cutoff for filtering results.
  - in_join_file (pd.DataFrame): The DataFrame to join with the search results.
  - in_join_column (str, optional): The column name in the join DataFrame to join on. Default is None.
- - search_df_join_column (str, optional): The column name in the search DataFrame to join on. Default is None.
- - device (str, optional): The device to run the model on. Default is torch_device.
- - embeddings_model (SentenceTransformer, optional): The embeddings model to use. Default is embeddings_model.
  - progress (gr.Progress, optional): Progress tracker for the function. Default is gr.Progress(track_tqdm=True).

  Returns:
- - pd.DataFrame: The DataFrame containing the search results.
  """

  progress(0, desc = "Conducting semantic search")

  ensure_output_folder_exists(output_folder)

  print("Searching")

- # Load the sentence transformer model and move it to GPU
- embeddings_model = embeddings_model.to(device)

  # Encode the query using the sentence transformer and convert to a PyTorch tensor
- query = embeddings_model.encode(query_str, normalize_embeddings=True)

- # Sentence transformers method, not used:
- cosine_similarities = query @ embeddings.T

- # Flatten the tensor to a 1D array
  cosine_similarities = cosine_similarities.flatten()

  # Create a Pandas Series
@@ -309,6 +380,13 @@ def bge_semantic_search(
  #results_df_out.to_excel(results_df_name, index= None)
  results_first_text = results_df_out.iloc[0, 1]

  print("Returning results")

  return results_first_text, results_df_name

  import numpy as np
  from datetime import datetime
  from search_funcs.helper_functions import get_file_path_end, create_highlighted_excel_wb, ensure_output_folder_exists, output_folder
  PandasDataFrame = Type[pd.DataFrame]
  today_rev = datetime.now().strftime("%Y%m%d")

+ def load_embedding_model(embeddings_name = "BAAI/bge-small-en-v1.5", embedding_loc="bge/"):
+
+     from torch import cuda, backends
+     from sentence_transformers import SentenceTransformer
+
+     # Check for torch cuda
+     print("Is CUDA enabled? ", cuda.is_available())
+     print("Is a CUDA device available on this computer?", backends.cudnn.enabled)
+     if cuda.is_available():
+         torch_device = "cuda"
+         #os.system("nvidia-smi")
+     else:
+         torch_device = "cpu"
+
+     print("Device used is: ", torch_device)
+
+     # Define a list of possible local locations to search for the model
+     local_embeddings_locations = [
+         "model/" + embedding_loc, # Potential local location
+         "/model/" + embedding_loc, # Potential location in Docker container
+         "/home/user/app/model/" + embedding_loc # This is inside a Docker container
+     ]
+
+     # Attempt to load the model from each local location
+     for location in local_embeddings_locations:
+         try:
+             embeddings_model = SentenceTransformer(location)
+             print(f"Found local model installation at: {location}")
+             break # Exit the loop if the model is found
+         except Exception as e:
+             print(f"Failed to load model from {location}: {e}")
+             continue
+     else:
+         # If the loop completes without finding the model in any local location
+         embeddings_model = SentenceTransformer(embeddings_name)
+         print("Could not find local model installation. Downloading from Huggingface")
+
+     # Load the sentence transformer model and move it to CPU/GPU
+     embeddings_model = embeddings_model.to(torch_device)
+
+     return embeddings_model, torch_device

  def docs_to_bge_embed_np_array(
      docs_out: list,
      in_file: list,
      output_file_state: str,
+     clean: str,
+     embeddings_state: np.ndarray,
+     embeddings_model_name:str,
+     embeddings_model_loc:str,
      return_intermediate_files: str = "No",
+     embeddings_compress: str = "No",
      progress: gr.Progress = gr.Progress(track_tqdm=True)
  ) -> tuple:
  """

  Parameters:
  - docs_out (list): List of documents to be embedded.
  - in_file (list): List of input files.
  - output_file_state (str): State of the output file.
  - clean (str): Indicates if the data should be cleaned.
+ - embeddings_state (np.ndarray): Current state of embeddings.
+ - embeddings_model_name (str): The Huggingface repo name of the embeddings model.
+ - embeddings_model_loc (str): Embeddings model save location.
  - return_intermediate_files (str, optional): Whether to return intermediate files. Default is "No".
+ - embeddings_compress (str, optional): Whether to compress the embeddings to int8 precision. Default is "No".
  - progress (gr.Progress, optional): Progress tracker for the function. Default is gr.Progress(track_tqdm=True).

  Returns:
  - tuple: A tuple containing the output message, embeddings, and output file state.
  """

+ embeddings_model, torch_device = load_embedding_model(embeddings_model_name, embeddings_model_loc)

  ensure_output_folder_exists(output_folder)

  out_message = "Document processing complete. Ready to search."

  if embeddings_state.size == 0:
      tic = time.perf_counter()
      print("Starting to embed documents.")

+     # Encode embeddings. If in normal mode, float32; if in compressed mode, int8
+     batch_size = 32
+
+     if "bge" in embeddings_model_name:
+         print("Embedding with BGE model")
+         if embeddings_compress == "No":
+             print("Embedding with full fp32 precision")
+             embeddings_out = embeddings_model.encode(sentences=page_contents, show_progress_bar = True, batch_size = batch_size, normalize_embeddings=True)
+         else:
+             print("Embedding with int8 precision")
+             embeddings_out = embeddings_model.encode(sentences=page_contents, show_progress_bar = True, batch_size = batch_size, normalize_embeddings=True, precision="int8")
+     else:
+         print("Embedding with MiniLM-L6-v2 model")
+         if embeddings_compress == "No":
+             print("Embedding with full fp32 precision")
+             embeddings_out = embeddings_model.encode(sentences=page_contents, show_progress_bar = True, batch_size = batch_size)
+         else:
+             print("Embedding with int8 precision")
+             embeddings_out = embeddings_model.encode(sentences=page_contents, show_progress_bar = True, batch_size = batch_size, precision="int8")

      toc = time.perf_counter()
      time_out = f"The embedding took {toc - tic:0.1f} seconds"

  else: data_file_name_no_ext = data_file_name_no_ext

  progress(0.9, desc = "Saving embeddings to file")
+ if embeddings_compress == "No":
      semantic_search_file_name = output_folder + data_file_name_no_ext + '_bge_embeddings.npz'
  else:
      semantic_search_file_name = output_folder + data_file_name_no_ext + '_bge_embedding_compress.npz'
+
+ np.savez_compressed(semantic_search_file_name, embeddings_out)

  output_file_state.append(semantic_search_file_name)

+ return out_message, embeddings_out, output_file_state, output_file_state, embeddings_model

+ return out_message, embeddings_out, output_file_state, output_file_state, embeddings_model
  else:
      # Just return existing embeddings if already exist
      embeddings_out = embeddings_state

  print(out_message)

+ return out_message, embeddings_out, output_file_state, output_file_state, embeddings_model

  def process_data_from_scores_df(
      df_docs: pd.DataFrame,

  embeddings: np.ndarray,
  documents: list,
  k_val: int,
+ vec_score_cut_off: float,
+ embeddings_model,
+ embeddings_model_name: str,
+ embeddings_compress:str,
  in_join_file: pd.DataFrame,
  in_join_column: str = None,
+ search_df_join_column: str = None,
  progress: gr.Progress = gr.Progress(track_tqdm=True)
+ ) -> tuple:
  """
  Perform a semantic search using the BGE model.

  - documents (list): The list of documents to search.
  - k_val (int): The number of top results to return.
  - vec_score_cut_off (float): The score cutoff for filtering results.
+ - embeddings_model (SentenceTransformer, optional): The embeddings model to use.
+ - embeddings_model_name (str): The Huggingface repo name of the embeddings model.
+ - embeddings_compress (str): Whether the embeddings have been compressed to int8 precision.
  - in_join_file (pd.DataFrame): The DataFrame to join with the search results.
  - in_join_column (str, optional): The column name in the join DataFrame to join on. Default is None.
+ - search_df_join_column (str, optional): The column name in the search DataFrame to join on. Default is None.
  - progress (gr.Progress, optional): Progress tracker for the function. Default is gr.Progress(track_tqdm=True).

  Returns:
+ - tuple: A tuple containing the search results.
  """

  progress(0, desc = "Conducting semantic search")

+ output_files = []
+
  ensure_output_folder_exists(output_folder)

  print("Searching")

+ from sentence_transformers import quantize_embeddings

  # Encode the query using the sentence transformer and convert to a PyTorch tensor
+ if "bge" in embeddings_model_name:
+     if embeddings_compress == "Yes":
+         query_fp32 = embeddings_model.encode(query_str, normalize_embeddings=True)
+
+         #query = query_fp32
+         query = quantize_embeddings(
+             query_fp32,
+             precision="int8",
+             calibration_embeddings=embeddings)
+     else:
+         query = embeddings_model.encode(query_str, normalize_embeddings=True)
+
+     # Get cosine similarities
+     cosine_similarities = query @ embeddings.T
+
+     # Sentence transformers method, not used:
+     #cosine_similarities = query @ embeddings.T
+     #cosine_similarities = embeddings_model.similarity(query, embeddings)
+     # Flatten the tensor to a 1D array
+     #cosine_similarities = cosine_similarities.flatten()
+ else:
+     print("Comparing similarity using MiniLM-L6-v2")
+
+     if embeddings_compress == "Yes":
+         query_fp32 = embeddings_model.encode(query_str, normalize_embeddings=True)
+
+         #query = query_fp32
+         query = quantize_embeddings(
+             query_fp32,
+             precision="int8",
+             calibration_embeddings=embeddings)
+     else:
+         query = embeddings_model.encode(query_str, normalize_embeddings=True)
+
+     #cosine_similarities = embeddings_model.cosine_similarity(query, embeddings)
+
+     print("query:", query_fp32)
+     print("embeddings:", embeddings)
+
+     embeddings_norm = np.linalg.norm(embeddings, axis=1)
+
+     embeddings_norm = np.linalg.norm(embeddings, axis=1, keepdims=True) # Keep dims to allow broadcasting
+     normalized_embeddings = embeddings / embeddings_norm
+
+     print("normalized_embeddings:", normalized_embeddings)
+
+     expanded_query_fp32 = np.expand_dims(query_fp32, axis=0)
+     cosine_similarities = (expanded_query_fp32 @ normalized_embeddings.T)
+
+     print("Initial cosine similarities:", cosine_similarities)
+
+     # Flatten the tensor to a 1D array
  cosine_similarities = cosine_similarities.flatten()

  # Create a Pandas Series

  #results_df_out.to_excel(results_df_name, index= None)
  results_first_text = results_df_out.iloc[0, 1]

+ output_files.append(results_df_name)
+
+ csv_output_file = output_folder + "semantic_search_result_" + today_rev + "_" + query_str_file + ".csv"
+ results_df_out.to_csv(csv_output_file, index=None)
+
+ output_files.append(csv_output_file)
+
  print("Returning results")

  return results_first_text, results_df_name
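The MiniLM branch above computes cosine similarity by hand; in isolation the arithmetic looks like this (a toy sketch with made-up vectors):

import numpy as np

# Toy document embeddings (rows = documents) and an un-normalised query
embeddings = np.array([[0.2, 0.9], [0.8, 0.1], [0.5, 0.5]])
query_fp32 = np.array([0.3, 0.8])

# Normalise both sides so a plain dot product equals cosine similarity
norms = np.linalg.norm(embeddings, axis=1, keepdims=True)  # keepdims allows broadcasting
normalized_embeddings = embeddings / norms
query = query_fp32 / np.linalg.norm(query_fp32)

cosine_similarities = (np.expand_dims(query, axis=0) @ normalized_embeddings.T).flatten()
print(cosine_similarities)  # highest value = most semantically similar document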
search_funcs/semantic_ingest_functions.py CHANGED
@@ -1,18 +1,15 @@
- # Install/import packages
  import time
- import re
  import ast
  import gzip
  import pandas as pd
  import gradio as gr
  import pickle
  from typing import Type, List, Literal
- #from langchain.text_splitter import RecursiveCharacterTextSplitter
-
  from pydantic import BaseModel, Field

  # Creating an alias for pandas DataFrame using Type
  PandasDataFrame = Type[pd.DataFrame]

  class Document(BaseModel):
      """Class for storing a piece of text and associated metadata. Implementation adapted from Langchain code: https://github.com/langchain-ai/langchain/blob/master/libs/core/langchain_core/documents/base.py"""
@@ -25,114 +22,21 @@ class Document(BaseModel):
  """
  type: Literal["Document"] = "Document"

- # Constants for chunking - not currently used
- split_strat = ["\n\n", "\n", ". ", "! ", "? "]
- chunk_size = 512
- chunk_overlap = 0
- start_index = True
-
- from search_funcs.helper_functions import get_file_path_end_with_ext, detect_file_type, get_file_path_end, ensure_output_folder_exists
  from search_funcs.bm25_functions import save_prepared_bm25_data, output_folder
  from search_funcs.clean_funcs import initial_clean

- def parse_file_not_used(file_paths, text_column='text'):
-     """
-     Accepts a list of file paths, determines each file's type based on its extension,
-     and passes it to the relevant parsing function.
-
-     Parameters:
-         file_paths (list): List of file paths.
-         text_column (str): Name of the column in CSV/Excel files that contains the text content.
-
-     Returns:
-         dict: A dictionary with file paths as keys and their parsed content (or error message) as values.
-     """
-
-     if not isinstance(file_paths, list):
-         raise ValueError("Expected a list of file paths.")
-
-     extension_to_parser = {
-         # '.pdf': parse_pdf,
-         # '.docx': parse_docx,
-         # '.txt': parse_txt,
-         # '.html': parse_html,
-         # '.htm': parse_html, # Considering both .html and .htm for HTML files
-         '.csv': lambda file_path: parse_csv_or_excel(file_path, text_column),
-         '.xlsx': lambda file_path: parse_csv_or_excel(file_path, text_column),
-         '.parquet': lambda file_path: parse_csv_or_excel(file_path, text_column)
-     }
-
-     parsed_contents = {}
-     file_names = []
-
-     for file_path in file_paths:
-         file_extension = detect_file_type(file_path.name)
-         if file_extension in extension_to_parser:
-             parsed_contents[file_path.name] = extension_to_parser[file_extension](file_path.name)
-         else:
-             parsed_contents[file_path.name] = f"Unsupported file type: {file_extension}"
-
-         filename_end = get_file_path_end_with_ext(file_path.name)
-
-         file_names.append(filename_end)
-
-     return parsed_contents, file_names
-
- def text_regex_clean(text):
-     # Merge hyphenated words
-     text = re.sub(r"(\w+)-\n(\w+)", r"\1\2", text)
-     # If a double newline ends in a letter, add a full stop.
-     text = re.sub(r'(?<=[a-zA-Z])\n\n', '.\n\n', text)
-     # Fix newlines in the middle of sentences
-     text = re.sub(r"(?<!\n\s)\n(?!\s\n)", " ", text.strip())
-     # Remove multiple newlines
-     text = re.sub(r"\n\s*\n", "\n\n", text)
-     text = re.sub(r"  ", " ", text)
-     # Add full stops and new lines between words with no space between where the second one has a capital letter
-     text = re.sub(r'(?<=[a-z])(?=[A-Z])', '. \n\n', text)
-
-     return text
-
- def parse_csv_or_excel(file_path, data_state, text_column = "text"):
-     """
-     Read in a CSV or Excel file.
-
-     Parameters:
-         file_path (str): Path to the CSV file.
-         text_column (str): Name of the column in the CSV file that contains the text content.
-
-     Returns:
-         Pandas DataFrame: Dataframe output from file read
-     """
-
-     file_list = [string.name for string in file_path]
-
-     #print(file_list)
-
-     data_file_names = [string for string in file_list if "tokenised" not in string.lower() and "npz" not in string.lower()]# and "gz" not in string.lower()]
-
-     data_file_name = data_file_names[0]
-
-     #for file_path in file_paths:
-     file_name = get_file_path_end_with_ext(data_file_name)
-
-     message = "Loaded in file. Now converting to document format."
-     print(message)
-
-     return data_state, file_name, message
-
- def write_out_metadata_as_string(metadata_in):
-     # If metadata_in is a single dictionary, wrap it in a list
-     if isinstance(metadata_in, dict):
-         metadata_in = [metadata_in]
-     metadata_string = [f"{' '.join(f'{k}: {v}' for k, v in d.items() if k != 'page_section')}" for d in metadata_in] # ['metadata']
-     return metadata_string

- def combine_metadata_columns(df, cols):

  df['metadata'] = '{'
  df['blank_column'] = ''
@@ -147,32 +51,14 @@ def combine_metadata_columns(df, cols):

  return df['metadata']

- def split_string_into_chunks(input_string, max_length, split_symbols):
-     # Check if input_string or split_symbols are empty
-     if not input_string or not split_symbols:
-         return [input_string]
-
-     chunks = []
-     current_chunk = ""
-
-     for char in input_string:
-         current_chunk += char
-         if len(current_chunk) >= max_length or char in split_symbols:
-             # Add the current chunk to the chunks list
-             chunks.append(current_chunk)
-             current_chunk = ""
-
-     # Adding any remaining part of the string
-     if current_chunk:
-         chunks.append(current_chunk)
-
-     return chunks
-
- def clean_line_breaks(text):
-     # Replace \n and \r\n with a space
      return text.replace('\n', ' ').replace('\r', ' ').replace('\r\n', ' ')

  def parse_metadata(row):
      try:
          # Ensure the 'title' field is a string and clean line breaks
          #if 'TITLE' in row:
@@ -193,8 +79,20 @@ def parse_metadata(row):
      # Handle the error or log it
      return None # or some default value

- def csv_excel_text_to_docs(df, in_file, text_column, clean = "No", return_intermediate_files = "No", chunk_size=None, progress=gr.Progress(track_tqdm=True)) -> List[Document]:
-     """Converts a DataFrame's content to a list of dictionaries in the 'Document' format, containing page_content and associated metadata."""

  ensure_output_folder_exists(output_folder)
  output_list = []
@@ -212,7 +110,7 @@ def csv_excel_text_to_docs(df, in_file, text_column, clean = "No", return_interm
  return doc_sections, "Please load in at least one csv/Excel/parquet data file.", output_list

  if not text_column:
-     return None, "Please enter a column name to search"

  data_file_name = data_file_names[0]

@@ -246,7 +144,6 @@ def csv_excel_text_to_docs(df, in_file, text_column, clean = "No", return_interm

  df[text_column] = df_list

-
  clean_toc = time.perf_counter()
  clean_time_out = f"Cleaning the text took {clean_toc - clean_tic:0.1f} seconds."
  print(clean_time_out)
@@ -285,26 +182,4 @@ def csv_excel_text_to_docs(df, in_file, text_column, clean = "No", return_interm
  output_list.append(out_doc_file_name)
  print("Documents saved to file.")

- return doc_sections, "Finished preparing documents.", output_list
-
- def document_to_dataframe(documents):
-     '''
-     Convert an object in document format to pandas dataframe
-     '''
-     rows = []
-
-     for doc in documents:
-         # Convert Document to dictionary and extract metadata
-         doc_dict = doc.dict()
-         metadata = doc_dict.pop('metadata')
-
-         # Add the page_content and type to the metadata
-         metadata['page_content'] = doc_dict['page_content']
-         metadata['type'] = doc_dict['type']
-
-         # Add to the list of rows
-         rows.append(metadata)
-
-     # Create a DataFrame from the list of rows
-     df = pd.DataFrame(rows)
-     return df

  import time
  import ast
  import gzip
  import pandas as pd
  import gradio as gr
  import pickle
  from typing import Type, List, Literal
  from pydantic import BaseModel, Field

  # Creating an alias for pandas DataFrame using Type
  PandasDataFrame = Type[pd.DataFrame]
+ PandasSeries = Type[pd.Series]

  class Document(BaseModel):
      """Class for storing a piece of text and associated metadata. Implementation adapted from Langchain code: https://github.com/langchain-ai/langchain/blob/master/libs/core/langchain_core/documents/base.py"""

  """
  type: Literal["Document"] = "Document"

+ from search_funcs.helper_functions import get_file_path_end, ensure_output_folder_exists
  from search_funcs.bm25_functions import save_prepared_bm25_data, output_folder
  from search_funcs.clean_funcs import initial_clean

+ def combine_metadata_columns(df:PandasDataFrame, cols:List[str]) -> PandasSeries:
+     '''
+     Construct a metadata column as a string version of a dictionary for later parsing.
+
+     Parameters:
+     - df (PandasDataFrame): Data frame of search data.
+     - cols (List[str]): List of column names that will be included in the output metadata column.
+
+     Returns:
+     - PandasSeries: A series containing the metadata elements combined into a dictionary format as a string.
+     '''

  df['metadata'] = '{'
  df['blank_column'] = ''

  return df['metadata']

+ def clean_line_breaks(text:str):
+     '''Replace \n and \r\n with a space'''
      return text.replace('\n', ' ').replace('\r', ' ').replace('\r\n', ' ')

  def parse_metadata(row):
+     '''
+     Parse a string instance of a dictionary into a Python object.
+     '''
      try:
          # Ensure the 'title' field is a string and clean line breaks
          #if 'TITLE' in row:

  # Handle the error or log it
  return None # or some default value

+ def csv_excel_text_to_docs(df:PandasDataFrame, in_file:List[str], text_column:str, clean:str = "No", return_intermediate_files:str = "No", progress=gr.Progress(track_tqdm=True)) -> tuple:
+     """Converts a DataFrame's content to a list of dictionaries in the 'Document' format, containing page_content and associated metadata.
+
+     Parameters:
+     - df (PandasDataFrame): Data frame of search data.
+     - in_file (List[str]): List of input file names.
+     - text_column (str): The text column that will be searched.
+     - clean (str): Whether the text is cleaned before searching.
+     - return_intermediate_files (str): Whether intermediate processing files are saved to file.
+     - progress (gr.Progress, optional): The progress tracker for the operation.
+
+     Returns:
+     - tuple: A tuple containing data outputs in a Document class format, an output message, and a list of output file paths.
+     """

  ensure_output_folder_exists(output_folder)
  output_list = []

  return doc_sections, "Please load in at least one csv/Excel/parquet data file.", output_list

  if not text_column:
+     return None, "Please enter a column name to search", output_list

  data_file_name = data_file_names[0]

  df[text_column] = df_list

  clean_toc = time.perf_counter()
  clean_time_out = f"Cleaning the text took {clean_toc - clean_tic:0.1f} seconds."
  print(clean_time_out)

  output_list.append(out_doc_file_name)
  print("Documents saved to file.")

+ return doc_sections, "Finished preparing documents.", output_list
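For reference, a sketch of the Document objects that csv_excel_text_to_docs emits (the field values are illustrative; the field names follow the Langchain-style class defined above):

from search_funcs.semantic_ingest_functions import Document

doc = Document(
    page_content="Text from the selected search column",
    metadata={"row": 0, "source": "example.csv"},
)
print(doc.type)  # "Document"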
search_funcs/spacy_search_funcs.py CHANGED
@@ -7,30 +7,19 @@ import gradio as gr
  import pandas as pd
  from typing import List, Type
  from datetime import datetime
- from search_funcs.helper_functions import create_highlighted_excel_wb, output_folder

  PandasDataFrame = Type[pd.DataFrame]

  today_rev = datetime.now().strftime("%Y%m%d")

- # Load the SpaCy model

- #os.system("python -m spacy download en_core_web_sm")
- try:
-     import en_core_web_sm
-     nlp = en_core_web_sm.load()
-     print("Successfully imported spaCy model")
-     #nlp = spacy.load("en_core_web_sm")
-     #print(nlp._path)
- except:
-     download("en_core_web_sm")
-     nlp = spacy.load("en_core_web_sm")
-     print("Successfully imported spaCy model")

  def spacy_fuzzy_search(string_query:str, tokenised_data: List[List[str]], original_data: PandasDataFrame, text_column:str, in_join_file: PandasDataFrame, search_df_join_column:str, in_join_column:str, no_spelling_mistakes:int = 1, progress=gr.Progress(track_tqdm=True)):
      ''' Conduct fuzzy match on a list of data.'''

-     #print("df_list:", df_list)

      # Convert tokenised data back into a list of strings
      df_list = list(map(" ".join, tokenised_data))

  import pandas as pd
  from typing import List, Type
  from datetime import datetime
+ from search_funcs.helper_functions import create_highlighted_excel_wb, output_folder, load_spacy_model

  PandasDataFrame = Type[pd.DataFrame]

  today_rev = datetime.now().strftime("%Y%m%d")

  def spacy_fuzzy_search(string_query:str, tokenised_data: List[List[str]], original_data: PandasDataFrame, text_column:str, in_join_file: PandasDataFrame, search_df_join_column:str, in_join_column:str, no_spelling_mistakes:int = 1, progress=gr.Progress(track_tqdm=True)):
      ''' Conduct fuzzy match on a list of data.'''

+     # Load spaCy model
+     nlp = load_spacy_model()

      # Convert tokenised data back into a list of strings
      df_list = list(map(" ".join, tokenised_data))