seanpedrickcase committed on
Commit
7f029b5
1 Parent(s): ea0dd40

Now accepts .zip files as inputs. Moved semantic search option bar. Minor API mode changes.

Browse files
.dockerignore CHANGED
@@ -1,7 +1,6 @@
1
  *.csv
2
  *.pyc
3
  *.cpython-311.pyc
4
- *.cpython-310.pyc
5
  *.bat
6
  *.json
7
  *.xlsx
@@ -16,6 +15,7 @@
16
  *.pkl
17
  *.pkl.gz
18
  *.pem
 
19
  docs/*
20
  build/*
21
  dist/*
 
1
  *.csv
2
  *.pyc
3
  *.cpython-311.pyc
 
4
  *.bat
5
  *.json
6
  *.xlsx
 
15
  *.pkl
16
  *.pkl.gz
17
  *.pem
18
+ *.zip
19
  docs/*
20
  build/*
21
  dist/*
.gitignore CHANGED
@@ -1,7 +1,6 @@
1
  *.csv
2
  *.pyc
3
  *.cpython-311.pyc
4
- *.cpython-310.pyc
5
  *.bat
6
  *.json
7
  *.xlsx
@@ -18,6 +17,7 @@
18
  *.pem
19
  *.json.out
20
  *.env
 
21
  docs/*
22
  build/*
23
  dist/*
 
1
  *.csv
2
  *.pyc
3
  *.cpython-311.pyc
 
4
  *.bat
5
  *.json
6
  *.xlsx
 
17
  *.pem
18
  *.json.out
19
  *.env
20
+ *.zip
21
  docs/*
22
  build/*
23
  dist/*
app.py CHANGED
@@ -78,7 +78,7 @@ depends on factors such as the type of documents or queries. Information taken f
78
  current_source = gr.Textbox(label="Current data source(s)", value="None")
79
 
80
  with gr.Accordion(label = "Load in data", open=True):
81
- in_bm25_file = gr.File(label="Upload data for keyword search", file_count= 'multiple', file_types =['.parquet', '.csv', '.pkl', '.pkl.gz'])
82
  with gr.Row():
83
  in_bm25_column = gr.Dropdown(label="Enter the name of the text column in the data file to search")
84
  load_bm25_data_button = gr.Button(value="Load data")
@@ -107,7 +107,7 @@ depends on factors such as the type of documents or queries. Information taken f
107
  current_source_semantic = gr.Textbox(label="Current data source(s)", value="None")
108
 
109
  with gr.Accordion("Load in data", open = True):
110
- in_semantic_file = gr.File(label="Upload data file for semantic search", file_count= 'multiple', file_types = ['.parquet', '.csv', '.npy', '.npz', '.pkl', '.pkl.gz'])
111
 
112
  with gr.Row():
113
  in_semantic_column = gr.Dropdown(label="Enter the name of the text column in the data file to search")
@@ -115,6 +115,9 @@ depends on factors such as the type of documents or queries. Information taken f
115
 
116
  semantic_load_progress = gr.Textbox(label="Load progress")
117
 
 
 
 
118
  semantic_query = gr.Textbox(label="Enter semantic search query here")
119
  semantic_submit = gr.Button(value="Start semantic search", variant="primary")
120
 
@@ -146,8 +149,7 @@ depends on factors such as the type of documents or queries. Information taken f
146
  in_search_param_button = gr.Button(value="Load search parameters (Need to click this if you changed anything above)")
147
  with gr.Accordion(label="Fuzzy search options", open = False):
148
  no_spelling_mistakes = gr.Slider(label = "Number of spelling mistakes allowed in fuzzy search", value = 1, minimum=1, maximum=4, step=1)
149
- with gr.Accordion(label="Semantic search options", open = False):
150
- semantic_min_distance = gr.Slider(label = "Minimum distance score for search result to be included", value = 0.6, minimum=0, maximum=0.95, step=0.01)
151
  with gr.Accordion(label = "Join on additional dataframes to results", open = False):
152
  in_join_file = gr.File(label="Upload your data to join here")
153
  in_join_message = gr.Textbox(label="Join file load progress")
@@ -180,7 +182,7 @@ depends on factors such as the type of documents or queries. Information taken f
180
 
181
  ### BM25 SEARCH ###
182
  # Update dropdowns upon initial file load
183
- in_bm25_file.change(initial_data_load, inputs=[in_bm25_file], outputs=[in_bm25_column, search_df_join_column, prepared_keyword_data_state, orig_keyword_data_state, bm25_search_index_state, embeddings_state, tokenised_prepared_keyword_data_state, load_finished_message, current_source], api_name="initial_load")
184
  in_join_file.change(put_columns_in_join_df, inputs=[in_join_file], outputs=[in_join_column, join_data_state, in_join_message])
185
 
186
  # Load in BM25 data
@@ -197,7 +199,8 @@ depends on factors such as the type of documents or queries. Information taken f
197
  ### SEMANTIC SEARCH ###
198
 
199
  # Load in a csv/excel file for semantic search
200
- in_semantic_file.change(initial_data_load, inputs=[in_semantic_file], outputs=[in_semantic_column, search_df_join_column, semantic_data_state, orig_semantic_data_state, bm25_search_index_state, embeddings_state, tokenised_prepared_keyword_data_state, semantic_load_progress, current_source_semantic])
 
201
  load_semantic_data_button.click(
202
  csv_excel_text_to_docs, inputs=[semantic_data_state, in_semantic_file, in_semantic_column, in_clean_data, return_intermediate_files], outputs=[semantic_input_document_format, semantic_load_progress, output_file_state], api_name="convert_texts_to_documents").\
203
  then(docs_to_bge_embed_np_array, inputs=[semantic_input_document_format, in_semantic_file, output_file_state, in_clean_data, embeddings_state, embeddings_model_name_state, embeddings_model_loc_state, return_intermediate_files, embeddings_compress], outputs=[semantic_load_progress, embeddings_state, semantic_output_file, output_file_state, embeddings_model_state], api_name="embed_documents")
 
78
  current_source = gr.Textbox(label="Current data source(s)", value="None")
79
 
80
  with gr.Accordion(label = "Load in data", open=True):
81
+ in_bm25_file = gr.File(label="Upload data for keyword search", file_count= 'multiple', file_types =['.parquet', '.csv', '.pkl', '.pkl.gz', '.zip'])
82
  with gr.Row():
83
  in_bm25_column = gr.Dropdown(label="Enter the name of the text column in the data file to search")
84
  load_bm25_data_button = gr.Button(value="Load data")
 
107
  current_source_semantic = gr.Textbox(label="Current data source(s)", value="None")
108
 
109
  with gr.Accordion("Load in data", open = True):
110
+ in_semantic_file = gr.File(label="Upload data file for semantic search", file_count= 'multiple', file_types = ['.parquet', '.csv', '.npy', '.npz', '.pkl', '.pkl.gz', '.zip'])
111
 
112
  with gr.Row():
113
  in_semantic_column = gr.Dropdown(label="Enter the name of the text column in the data file to search")
 
115
 
116
  semantic_load_progress = gr.Textbox(label="Load progress")
117
 
118
+ with gr.Accordion(label="Semantic search options", open = False):
119
+ semantic_min_distance = gr.Slider(label = "Minimum distance score for search result to be included", value = 0.2, minimum=0, maximum=0.95, step=0.01)
120
+
121
  semantic_query = gr.Textbox(label="Enter semantic search query here")
122
  semantic_submit = gr.Button(value="Start semantic search", variant="primary")
123
 
 
149
  in_search_param_button = gr.Button(value="Load search parameters (Need to click this if you changed anything above)")
150
  with gr.Accordion(label="Fuzzy search options", open = False):
151
  no_spelling_mistakes = gr.Slider(label = "Number of spelling mistakes allowed in fuzzy search", value = 1, minimum=1, maximum=4, step=1)
152
+
 
153
  with gr.Accordion(label = "Join on additional dataframes to results", open = False):
154
  in_join_file = gr.File(label="Upload your data to join here")
155
  in_join_message = gr.Textbox(label="Join file load progress")
 
182
 
183
  ### BM25 SEARCH ###
184
  # Update dropdowns upon initial file load
185
+ in_bm25_file.upload(initial_data_load, inputs=[in_bm25_file], outputs=[in_bm25_column, search_df_join_column, prepared_keyword_data_state, orig_keyword_data_state, bm25_search_index_state, embeddings_state, tokenised_prepared_keyword_data_state, load_finished_message, current_source, in_bm25_file], api_name="keyword_data_load")
186
  in_join_file.change(put_columns_in_join_df, inputs=[in_join_file], outputs=[in_join_column, join_data_state, in_join_message])
187
 
188
  # Load in BM25 data
 
199
  ### SEMANTIC SEARCH ###
200
 
201
  # Load in a csv/excel file for semantic search
202
+ in_semantic_file.upload(initial_data_load, inputs=[in_semantic_file], outputs=[in_semantic_column, search_df_join_column, semantic_data_state, orig_semantic_data_state, bm25_search_index_state, embeddings_state, tokenised_prepared_keyword_data_state, semantic_load_progress, current_source_semantic, in_semantic_file], api_name="semantic_data_load")
203
+
204
  load_semantic_data_button.click(
205
  csv_excel_text_to_docs, inputs=[semantic_data_state, in_semantic_file, in_semantic_column, in_clean_data, return_intermediate_files], outputs=[semantic_input_document_format, semantic_load_progress, output_file_state], api_name="convert_texts_to_documents").\
206
  then(docs_to_bge_embed_np_array, inputs=[semantic_input_document_format, in_semantic_file, output_file_state, in_clean_data, embeddings_state, embeddings_model_name_state, embeddings_model_loc_state, return_intermediate_files, embeddings_compress], outputs=[semantic_load_progress, embeddings_state, semantic_output_file, output_file_state, embeddings_model_state], api_name="embed_documents")
search_funcs/bm25_functions.py CHANGED
@@ -685,10 +685,9 @@ def bm25_search(
685
 
686
  output_files.append(results_df_name)
687
 
688
- csv_output_file = output_folder + "keyword_search_result_" + today_rev + "_" + query_str_file + ".csv"
689
- results_df_out.to_csv(csv_output_file, index=None)
690
-
691
- output_files.append(csv_output_file)
692
 
693
  print("Returning results")
694
 
 
685
 
686
  output_files.append(results_df_name)
687
 
688
+ #csv_output_file = output_folder + "keyword_search_result_" + today_rev + "_" + query_str_file + ".csv"
689
+ #results_df_out.to_csv(csv_output_file, index=None)
690
+ #output_files.append(csv_output_file)
 
691
 
692
  print("Returning results")
693
 
search_funcs/helper_functions.py CHANGED
@@ -6,6 +6,7 @@ import os
6
  import shutil
7
  import getpass
8
  import gzip
 
9
  import pickle
10
  import numpy as np
11
 
@@ -177,7 +178,40 @@ def read_file(filename):
177
 
178
  return file
179
 
180
- def initial_data_load(in_file:List[str]):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
181
  '''
182
  When file is loaded, update the column dropdown choices and relevant state variables
183
  '''
@@ -192,10 +226,15 @@ def initial_data_load(in_file:List[str]):
192
 
193
  file_list = [string.name for string in in_file]
194
 
195
- #print(file_list)
 
 
 
 
 
196
 
197
  data_file_names = [string for string in file_list if "tokenised" not in string.lower() and "npz" not in string.lower() and "search_index" not in string.lower()]
198
- print(data_file_names)
199
 
200
  if not data_file_names:
201
  out_message = "Please load in at least one csv/Excel/parquet data file."
@@ -204,9 +243,10 @@ def initial_data_load(in_file:List[str]):
204
 
205
  # This if you have loaded in a documents object for the semantic search
206
  if "pkl" in data_file_names[0]:
 
207
  df = read_file(data_file_names[0])
208
  new_choices = list(df[0].metadata.keys()) #["Documents"] #["page_contents"] +
209
- current_source = get_file_path_end_with_ext(data_file_names[0])
210
 
211
  # This if you have loaded in a csv/parquets/xlsx
212
  else:
@@ -231,11 +271,14 @@ def initial_data_load(in_file:List[str]):
231
 
232
  concat_choices.extend(new_choices)
233
 
 
 
234
  # Check if there is a search index file already
235
- index_file_names = [string for string in file_list if "gz" in string.lower()]
236
 
237
  if index_file_names:
238
  index_file_name = index_file_names[0]
 
239
  index_load = read_file(index_file_name)
240
 
241
  embeddings_file_names = [string for string in file_list if "embedding" in string.lower()]
@@ -254,10 +297,10 @@ def initial_data_load(in_file:List[str]):
254
  if tokenised_file_names:
255
  tokenised_load = read_file(tokenised_file_names[0])
256
 
257
- out_message = "Initial data check successful. Next, choose a data column to search in the drop down above, then click 'Load data'"
258
  print(out_message)
259
 
260
- return gr.Dropdown(choices=concat_choices), gr.Dropdown(choices=concat_choices), df, df, index_load, embed_load, tokenised_load, out_message, current_source
261
 
262
  def put_columns_in_join_df(in_file:str):
263
  '''
 
6
  import shutil
7
  import getpass
8
  import gzip
9
+ import zipfile
10
  import pickle
11
  import numpy as np
12
 
 
178
 
179
  return file
180
 
181
+ def process_zip_files(file_list, progress=gr.Progress(track_tqdm=True)):
182
+ """
183
+ Processes a list of file names, unzipping any ZIP files found
184
+ and adding the extracted file names to the list.
185
+
186
+ Args:
187
+ file_list: A list of file names (strings).
188
+ """
189
+ progress(0.1, desc="Unzipping zip files")
190
+
191
+ i = 0
192
+ while i < len(file_list): # Use 'while' for dynamic list changes
193
+ file_path = file_list[i]
194
+
195
+ if file_path.endswith(".zip"):
196
+ try:
197
+ zip_dir = os.path.dirname(file_path) or "." # Get zip file's directory or use current if none
198
+ with zipfile.ZipFile(file_path, 'r') as zip_ref:
199
+ zip_ref.extractall(zip_dir) # Extract to zip's directory
200
+ #print("List of files in zip:", zip_ref.namelist())
201
+ extracted_files = [os.path.join(zip_dir, name) for name in zip_ref.namelist()]
202
+ file_list.extend(extracted_files)
203
+
204
+ except zipfile.BadZipFile:
205
+ print(f"Warning: '{file_path}' is not a valid zip file.")
206
+
207
+ i += 1
208
+
209
+ file_list = [file for file in file_list if not file.endswith(".zip")]
210
+ print("file_list after files in zip extracted:", file_list)
211
+
212
+ return file_list
213
+
214
+ def initial_data_load(in_file:List[str], progress = gr.Progress(track_tqdm=True)):
215
  '''
216
  When file is loaded, update the column dropdown choices and relevant state variables
217
  '''
 
226
 
227
  file_list = [string.name for string in in_file]
228
 
229
+ # If a zip file is loaded, unzip it and add the file names to the file_list
230
+ file_list = process_zip_files(file_list)
231
+
232
+ #print("File_list that makes it to main data load function:", file_list)
233
+
234
+ progress(0.3, desc="Loading in data files")
235
 
236
  data_file_names = [string for string in file_list if "tokenised" not in string.lower() and "npz" not in string.lower() and "search_index" not in string.lower()]
237
+ print("Data file names:", data_file_names)
238
 
239
  if not data_file_names:
240
  out_message = "Please load in at least one csv/Excel/parquet data file."
 
243
 
244
  # This if you have loaded in a documents object for the semantic search
245
  if "pkl" in data_file_names[0]:
246
+ print("Document object for semantic search:", data_file_names[0])
247
  df = read_file(data_file_names[0])
248
  new_choices = list(df[0].metadata.keys()) #["Documents"] #["page_contents"] +
249
+ current_source = get_file_path_end_with_ext(data_file_names[0])
250
 
251
  # This if you have loaded in a csv/parquets/xlsx
252
  else:
 
271
 
272
  concat_choices.extend(new_choices)
273
 
274
+ progress(0.6, desc="Loading in embedding/search index files")
275
+
276
  # Check if there is a search index file already
277
+ index_file_names = [string for string in file_list if ".gz" in string.lower()]
278
 
279
  if index_file_names:
280
  index_file_name = index_file_names[0]
281
+ print("Search index file name found:", index_file_name)
282
  index_load = read_file(index_file_name)
283
 
284
  embeddings_file_names = [string for string in file_list if "embedding" in string.lower()]
 
297
  if tokenised_file_names:
298
  tokenised_load = read_file(tokenised_file_names[0])
299
 
300
+ out_message = "Initial data load successful. Next, choose a data column to search in the drop down above, then click 'Load data'"
301
  print(out_message)
302
 
303
+ return gr.Dropdown(choices=concat_choices), gr.Dropdown(choices=concat_choices), df, df, index_load, embed_load, tokenised_load, out_message, current_source, file_list
304
 
305
  def put_columns_in_join_df(in_file:str):
306
  '''
search_funcs/semantic_functions.py CHANGED
@@ -1,4 +1,3 @@
1
- import os
2
  import time
3
  import pandas as pd
4
  from typing import Type
@@ -116,20 +115,15 @@ def docs_to_bge_embed_np_array(
116
 
117
  if "bge" in embeddings_model_name:
118
  print("Embedding with BGE model")
119
- if embeddings_compress == "No":
120
- print("Embedding with full fp32 precision")
121
- embeddings_out = embeddings_model.encode(sentences=page_contents, show_progress_bar = True, batch_size = batch_size, normalize_embeddings=True)
122
- else:
123
- print("Embedding with int8 precision")
124
- embeddings_out = embeddings_model.encode(sentences=page_contents, show_progress_bar = True, batch_size = batch_size, normalize_embeddings=True, precision="int8")
125
  else:
126
  print("Embedding with MiniLM-L6-v2 model")
127
- if embeddings_compress == "No":
128
- print("Embedding with full fp32 precision")
129
- embeddings_out = embeddings_model.encode(sentences=page_contents, show_progress_bar = True, batch_size = batch_size)
130
- else:
131
- print("Embedding with int8 precision")
132
- embeddings_out = embeddings_model.encode(sentences=page_contents, show_progress_bar = True, batch_size = batch_size, precision="int8")
 
133
 
134
  toc = time.perf_counter()
135
  time_out = f"The embedding took {toc - tic:0.1f} seconds"
@@ -288,60 +282,43 @@ def bge_semantic_search(
288
 
289
  # Encode the query using the sentence transformer and convert to a PyTorch tensor
290
  if "bge" in embeddings_model_name:
291
- if embeddings_compress == "Yes":
292
- query_fp32 = embeddings_model.encode(query_str, normalize_embeddings=True)
293
-
294
- #query = query_fp32
295
- query = quantize_embeddings(
296
- query_fp32,
297
- precision="int8",
298
- calibration_embeddings=embeddings)
299
-
300
- else:
301
- query = embeddings_model.encode(query_str, normalize_embeddings=True)
302
 
303
- # Get cosine similarities
304
- cosine_similarities = query @ embeddings.T
305
 
306
- # Sentence transformers method, not used:
307
- #cosine_similarities = query @ embeddings.T
308
 
309
- #cosine_similarities = embeddings_model.similarity(query, embeddings)
310
- # Flatten the tensor to a 1D array
311
- #cosine_similarities = cosine_similarities.flatten()
 
 
312
  else:
313
- print("Comparing similarity using Minilm-L6-v2")
 
 
 
314
 
315
- if embeddings_compress == "Yes":
316
- query_fp32 = embeddings_model.encode(query_str, normalize_embeddings=True)
317
-
318
- #query = query_fp32
319
- query = quantize_embeddings(
320
- query_fp32,
321
- precision="int8",
322
- calibration_embeddings=embeddings)
323
- else:
324
- query = embeddings_model.encode(query_str, normalize_embeddings=True)
325
 
326
- #cosine_similarities = embeddings_model.cosine_similarity(query, embeddings)
327
 
328
- print("query:", query_fp32)
329
- print("embeddings:", embeddings)
330
 
331
- embeddings_norm = np.linalg.norm(embeddings, axis=1)
332
 
333
- embeddings_norm = np.linalg.norm(embeddings, axis=1, keepdims=True) # Keep dims to allow broadcasting
334
- normalized_embeddings = embeddings / embeddings_norm
335
 
336
- print("normalized_embeddings:", normalized_embeddings)
 
337
 
338
- expanded_query_fp32 = np.expand_dims(query_fp32, axis=0)
339
- cosine_similarities = (expanded_query_fp32 @ normalized_embeddings.T)
340
 
341
- print("Initial cosine similarities:", cosine_similarities)
342
-
343
- # Flatten the tensor to a 1D array
344
- cosine_similarities = cosine_similarities.flatten()
345
 
346
  # Create a Pandas Series
347
  cosine_similarities_series = pd.Series(cosine_similarities)
@@ -379,14 +356,12 @@ def bge_semantic_search(
379
 
380
  #results_df_out.to_excel(results_df_name, index= None)
381
  results_first_text = results_df_out.iloc[0, 1]
382
-
383
  output_files.append(results_df_name)
384
 
385
- csv_output_file = output_folder + "semantic_search_result_" + today_rev + "_" + query_str_file + ".csv"
386
- results_df_out.to_csv(csv_output_file, index=None)
387
-
388
- output_files.append(csv_output_file)
389
 
390
  print("Returning results")
391
 
392
- return results_first_text, results_df_name
 
 
1
  import time
2
  import pandas as pd
3
  from typing import Type
 
115
 
116
  if "bge" in embeddings_model_name:
117
  print("Embedding with BGE model")
 
 
 
 
 
 
118
  else:
119
  print("Embedding with MiniLM-L6-v2 model")
120
+
121
+ if embeddings_compress == "No":
122
+ print("Embedding with full fp32 precision")
123
+ embeddings_out = embeddings_model.encode(sentences=page_contents, show_progress_bar = True, batch_size = batch_size)
124
+ else:
125
+ print("Embedding with int8 precision")
126
+ embeddings_out = embeddings_model.encode(sentences=page_contents, show_progress_bar = True, batch_size = batch_size, precision="int8")
127
 
128
  toc = time.perf_counter()
129
  time_out = f"The embedding took {toc - tic:0.1f} seconds"
 
282
 
283
  # Encode the query using the sentence transformer and convert to a PyTorch tensor
284
  if "bge" in embeddings_model_name:
285
+ print("Comparing similarity using BGE model")
286
+ else:
287
+ print("Comparing similarity using MiniLM-L6-v2 model")
 
 
 
 
 
 
 
 
288
 
 
 
289
 
290
+ if embeddings_compress == "Yes":
291
+ query_fp32 = embeddings_model.encode(query_str)
292
 
293
+ # Using a query as int8 doesn't actually seem to work
294
+ # query_int8 = quantize_embeddings(
295
+ # query_fp32, precision="int8", calibration_embeddings=embeddings
296
+ # )
297
+
298
  else:
299
+ query_fp32 = embeddings_model.encode(query_str)
300
+
301
+ #print("query:", query_fp32)
302
+ #print("embeddings:", embeddings)
303
 
304
+ # Normalise embeddings
 
 
 
 
 
 
 
 
 
305
 
306
+ query = query_fp32.astype('float32')
307
 
308
+ query_norm = np.linalg.norm(query)
309
+ normalized_query = query / query_norm
310
 
311
+ embeddings = embeddings.astype('float32')
312
 
313
+ embeddings_norm = np.linalg.norm(embeddings, axis=1, keepdims=True) # Keep dims to allow broadcasting
314
+ normalized_embeddings = embeddings / embeddings_norm
315
 
316
+ #print("normalized_query:", normalized_query)
317
+ #print("normalized_embeddings:", normalized_embeddings)
318
 
319
+ cosine_similarities = (normalized_query @ normalized_embeddings.T)
 
320
 
321
+ #print("Initial cosine similarities:", cosine_similarities)
 
 
 
322
 
323
  # Create a Pandas Series
324
  cosine_similarities_series = pd.Series(cosine_similarities)
 
356
 
357
  #results_df_out.to_excel(results_df_name, index= None)
358
  results_first_text = results_df_out.iloc[0, 1]
 
359
  output_files.append(results_df_name)
360
 
361
+ #csv_output_file = output_folder + "semantic_search_result_" + today_rev + "_" + query_str_file + ".csv"
362
+ #results_df_out.to_csv(csv_output_file, index=None)
363
+ #output_files.append(csv_output_file)
 
364
 
365
  print("Returning results")
366
 
367
+ return results_first_text, output_files
search_funcs/spacy_search_funcs.py CHANGED
@@ -1,7 +1,3 @@
1
- import spacy
2
- spacy.prefer_gpu()
3
- from spacy.cli.download import download
4
- from spacy.matcher import Matcher
5
  import numpy as np
6
  import gradio as gr
7
  import pandas as pd
@@ -13,11 +9,13 @@ PandasDataFrame = Type[pd.DataFrame]
13
 
14
  today_rev = datetime.now().strftime("%Y%m%d")
15
 
16
-
17
-
18
  def spacy_fuzzy_search(string_query:str, tokenised_data: List[List[str]], original_data: PandasDataFrame, text_column:str, in_join_file: PandasDataFrame, search_df_join_column:str, in_join_column:str, no_spelling_mistakes:int = 1, progress=gr.Progress(track_tqdm=True)):
19
  ''' Conduct fuzzy match on a list of data.'''
20
 
 
 
 
 
21
  # Load spaCy model
22
  nlp = load_spacy_model()
23
 
 
 
 
 
 
1
  import numpy as np
2
  import gradio as gr
3
  import pandas as pd
 
9
 
10
  today_rev = datetime.now().strftime("%Y%m%d")
11
 
 
 
12
  def spacy_fuzzy_search(string_query:str, tokenised_data: List[List[str]], original_data: PandasDataFrame, text_column:str, in_join_file: PandasDataFrame, search_df_join_column:str, in_join_column:str, no_spelling_mistakes:int = 1, progress=gr.Progress(track_tqdm=True)):
13
  ''' Conduct fuzzy match on a list of data.'''
14
 
15
+ import spacy
16
+ spacy.prefer_gpu()
17
+ from spacy.matcher import Matcher
18
+
19
  # Load spaCy model
20
  nlp = load_spacy_model()
21