Sean-Case committed
Commit 3df8e40
1 Parent(s): 200480d

Fixed data input for semantic search and allowed docs to be loaded in directly. 0.2.1

README.md CHANGED
@@ -15,7 +15,7 @@ Search through long-form text fields in your tabular data. Either for exact, spe
 # Guide
 ## Keyword search
 
-1. Load in your tabular data file (.csv, .parquet, .xlsx - first sheet). If the 'Keyword search' folder has been prepared, select both of the .parquet files in this folder (both the file with and without 'tokenised' in the name) to load into the app.
+1. Load in your tabular data file (.csv, .parquet, .xlsx - first sheet). If the 'Keyword search' folder has been prepared, select both of the files in this folder (both the data file and the file ending 'search_index.pkl.gz') to load into the app.
 2. Wait for the file(s) to upload, then in the dropdown menu below 'Enter the name of the text column...' choose the column from the data file that you want to search.
 3. Hit 'Load data'. The 'Load progress' text box will let you know when the file is ready.
 4. In the 'Enter your search term' area below this, type in the key words you want to find in your text. Note that if the term is not spelled exactly as it is found in the text, it will not be found!
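
For context, a minimal sketch of what loading the prepared file pair looks like in Python (file names here are illustrative, and unpickling the index requires the app's BM25 class to be importable):

```python
import gzip
import pickle

import pandas as pd

# The tabular data file (hypothetical name).
data_df = pd.read_parquet("reviews_cleaned.parquet")

# The prepared BM25 index, saved by the app as '..._search_index.pkl.gz'.
with gzip.open("reviews_cleaned_search_index.pkl.gz", "rb") as f:
    bm25_index = pickle.load(f)
```
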
app.py CHANGED
@@ -28,12 +28,13 @@ with block:
     out_passages = gr.State(9999)
     vec_weight = gr.State(1)
 
-    docs_keep_as_doc_state = gr.State()
-    doc_df_state = gr.State()
-    docs_keep_out_state = gr.State()
+    #docs_keep_as_doc_state = gr.State()
+    #doc_df_state = gr.State()
+    #docs_keep_out_state = gr.State()
 
     corpus_state = gr.State()
-    data_state = gr.State(pd.DataFrame())
+    keyword_data_state = gr.State(pd.DataFrame())
+    semantic_data_state = gr.State(pd.DataFrame())
 
     in_k1_info = gr.State("""k1: Constant used for influencing the term frequency saturation. After saturation is reached, additional
     presence for the term adds a significantly less additional score. According to [1]_, experiments suggest
@@ -58,13 +59,13 @@ depends on factors such as the type of documents or queries. Information taken f
     """
     **Exact term keyword search**
 
-    1. Load in data file (ideally a file with '_cleaned' at the end of the name), with (optionally) the '...tokenised.parquet' in the same folder to save loading time. 2. Select the field in your data to search. A field with the suffix '_cleaned' means that html tags have been removed. 3. Wait for the data file to be prepared for search. 4. Enter the search term in the relevant box below and press Enter/click on 'Search text'. 5. Your search results will be saved in a csv file and will be presented in the 'File output' area below.
+    1. Load in data file (ideally a file with '_cleaned' at the end of the name), with (optionally) the '...search_index.pkl.gz' in the same folder to save loading time. 2. Select the field in your data to search. A field with the suffix '_cleaned' means that html tags have been removed. 3. Wait for the data file to be prepared for search. 4. Enter the search term in the relevant box below and press Enter/click on 'Search text'. 5. Your search results will be saved in a csv file and will be presented in the 'File output' area below.
     """)
     with gr.Row():
         current_source = gr.Textbox(label="Current data source(s)", value="None")
 
     with gr.Accordion(label = "Load in data", open=True):
-        in_bm25_file = gr.File(label="Upload data for keyword search", file_count= 'multiple', file_types = ['.parquet', '.csv'])
+        in_bm25_file = gr.File(label="Upload data for keyword search", file_count= 'multiple', file_types = ['.parquet', '.csv', '.pkl', '.pkl.gz'])
         with gr.Row():
             in_bm25_column = gr.Dropdown(label="Enter the name of the text column in the data file to search")
             load_bm25_data_button = gr.Button(value="Load data")
@@ -148,22 +149,22 @@ depends on factors such as the type of documents or queries. Information taken f
 
     ### BM25 SEARCH ###
     # Update dropdowns upon initial file load
-    in_bm25_file.upload(put_columns_in_df, inputs=[in_bm25_file, in_bm25_column], outputs=[in_bm25_column, in_clean_data, search_df_join_column, data_state])
+    in_bm25_file.upload(put_columns_in_df, inputs=[in_bm25_file, in_bm25_column], outputs=[in_bm25_column, in_clean_data, search_df_join_column, keyword_data_state])
     in_join_file.upload(put_columns_in_join_df, inputs=[in_join_file, in_join_column], outputs=[in_join_column])
 
     # Load in BM25 data
-    load_bm25_data_button.click(fn=prepare_bm25_input_data, inputs=[in_bm25_file, in_bm25_column, data_state, in_clean_data, return_intermediate_files], outputs=[corpus_state, load_finished_message, data_state, output_file, output_file, current_source]).\
-    then(fn=prepare_bm25, inputs=[corpus_state, in_k1, in_b, in_alpha], outputs=[load_finished_message])#.\
+    load_bm25_data_button.click(fn=prepare_bm25_input_data, inputs=[in_bm25_file, in_bm25_column, keyword_data_state, in_clean_data, return_intermediate_files], outputs=[corpus_state, load_finished_message, keyword_data_state, output_file, output_file, current_source]).\
+    then(fn=prepare_bm25, inputs=[corpus_state, in_bm25_file, return_intermediate_files, in_k1, in_b, in_alpha], outputs=[load_finished_message, output_file])#.\
     #then(fn=put_columns_in_df, inputs=[in_bm25_file, in_bm25_column], outputs=[in_bm25_column, in_clean_data, search_df_join_column])
 
     # BM25 search functions on click or enter
-    keyword_search_button.click(fn=bm25_search, inputs=[keyword_query, in_no_search_results, data_state, in_bm25_column, in_clean_data, in_join_file, in_join_column, search_df_join_column], outputs=[output_single_text, output_file], api_name="keyword")
-    keyword_query.submit(fn=bm25_search, inputs=[keyword_query, in_no_search_results, data_state, in_bm25_column, in_clean_data, in_join_file, in_join_column, search_df_join_column], outputs=[output_single_text, output_file])
+    keyword_search_button.click(fn=bm25_search, inputs=[keyword_query, in_no_search_results, keyword_data_state, in_bm25_column, in_clean_data, in_join_file, in_join_column, search_df_join_column], outputs=[output_single_text, output_file], api_name="keyword")
+    keyword_query.submit(fn=bm25_search, inputs=[keyword_query, in_no_search_results, keyword_data_state, in_bm25_column, in_clean_data, in_join_file, in_join_column, search_df_join_column], outputs=[output_single_text, output_file])
 
     ### SEMANTIC SEARCH ###
     # Load in a csv/excel file for semantic search
-    in_semantic_file.upload(put_columns_in_df, inputs=[in_semantic_file, in_semantic_column], outputs=[in_semantic_column, in_clean_data, search_df_join_column, data_state])
-    load_semantic_data_button.click(parse_csv_or_excel, inputs=[in_semantic_file, data_state, in_semantic_column], outputs=[ingest_text, current_source_semantic, semantic_load_progress]).\
+    in_semantic_file.upload(put_columns_in_df, inputs=[in_semantic_file, in_semantic_column], outputs=[in_semantic_column, in_clean_data, search_df_join_column, semantic_data_state])
+    load_semantic_data_button.click(parse_csv_or_excel, inputs=[in_semantic_file, semantic_data_state, in_semantic_column], outputs=[ingest_text, current_source_semantic, semantic_load_progress]).\
     then(csv_excel_text_to_docs, inputs=[ingest_text, in_semantic_file, in_semantic_column, in_clean_data, return_intermediate_files], outputs=[ingest_docs, semantic_load_progress]).\
    then(docs_to_jina_embed_np_array, inputs=[ingest_docs, in_semantic_file, return_intermediate_files, embedding_super_compress], outputs=[semantic_load_progress, vectorstore_state, semantic_output_file])
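
The wiring above relies on per-mode gr.State holders and chained events. A minimal self-contained sketch of the same pattern (component names and functions here are illustrative, not the app's, and file.name assumes Gradio 3-style file objects):

```python
import gradio as gr
import pandas as pd

def load_data(file, df_state):
    # Illustrative loader; in the real app this is prepare_bm25_input_data.
    df = pd.read_csv(file.name)
    return df, "Data loaded."

def build_index(df_state):
    # Illustrative second step; in the real app this is prepare_bm25.
    return f"Index built over {len(df_state)} rows."

with gr.Blocks() as demo:
    keyword_data_state = gr.State(pd.DataFrame())   # one state per search mode,
    semantic_data_state = gr.State(pd.DataFrame())  # so the two tabs no longer share data

    in_file = gr.File(label="Upload data")
    load_btn = gr.Button("Load data")
    progress = gr.Textbox(label="Load progress")

    # .click() returns an event that .then() chains onto, mirroring app.py.
    load_btn.click(load_data, inputs=[in_file, keyword_data_state],
                   outputs=[keyword_data_state, progress]).\
        then(build_index, inputs=[keyword_data_state], outputs=[progress])

demo.launch()
```
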
how_to_create_exe_dist.txt CHANGED
@@ -17,10 +17,10 @@ NOTE: for ensuring that spaCy models are loaded into the program correctly in re
 8. In command line, cd to the folder that contains app.py. Then run the following:
 
 For one single file:
-python -m PyInstaller --additional-hooks-dir="build_deps\\" --hidden-import pyarrow.vendored.version --add-data="build_deps\\types.json;gradio_client" --add-data "model;model" --onefile --clean --noconfirm --name DataSearchApp_0.1.1 app.py
+python -m PyInstaller --additional-hooks-dir="build_deps\\" --hidden-import pyarrow.vendored.version --add-data="build_deps\\types.json;gradio_client" --add-data "model;model" --onefile --clean --noconfirm --name DataSearchApp_0.2.1 app.py
 
 For a small exe with a folder of dependencies:
-python -m PyInstaller --additional-hooks-dir="build_deps\\" --hidden-import pyarrow.vendored.version --add-data="build_deps\\types.json;gradio_client" --add-data "model;model" --clean --noconfirm --name DataSearchApp_0.1.1 app.py
+python -m PyInstaller --additional-hooks-dir="build_deps\\" --hidden-import pyarrow.vendored.version --add-data="build_deps\\types.json;gradio_client" --add-data "model;model" --clean --noconfirm --name DataSearchApp_0.2.1 app.py
 
 9. A 'dist' folder will be created with the executable inside along with all dependencies('dist\data_text_search').
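If the build needs to run from a script rather than the shell, PyInstaller also exposes a programmatic entry point; a minimal sketch mirroring the single-file command above (flags copied from this file, nothing added):

```python
import PyInstaller.__main__

# Equivalent to the 'one single file' command above.
PyInstaller.__main__.run([
    "--additional-hooks-dir=build_deps\\",
    "--hidden-import", "pyarrow.vendored.version",
    "--add-data=build_deps\\types.json;gradio_client",
    "--add-data", "model;model",
    "--onefile",
    "--clean",
    "--noconfirm",
    "--name", "DataSearchApp_0.2.1",
    "app.py",
])
```
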
search_funcs/bm25_functions.py CHANGED
@@ -3,8 +3,10 @@ import heapq
 import math
 import pickle
 import sys
+import gzip
 import time
 import pandas as pd
+import numpy as np
 from numpy import inf
 import gradio as gr
 
@@ -235,7 +237,7 @@ def prepare_bm25_input_data(in_file, text_column, data_state, clean="No", retur
 
     #print(file_list)
 
-    data_file_names = [string.lower() for string in file_list if "tokenised" not in string and "npz" not in string.lower()]
+    data_file_names = [string.lower() for string in file_list if "tokenised" not in string and "npz" not in string.lower() and "gz" not in string.lower()]
 
     data_file_name = data_file_names[0]
 
@@ -247,20 +249,24 @@
     tokenised_df = pd.DataFrame()
 
     tokenised_file_names = [string.lower() for string in file_list if "tokenised" in string.lower()]
+    search_index_file_names = [string.lower() for string in file_list if "gz" in string.lower()]
+
+    df[text_column] = df[text_column].astype(str).str.lower()
+
+    if search_index_file_names:
+        corpus = list(df[text_column])
+        message = "Tokenisation skipped - loading search index from file."
+        print(message)
+        return corpus, message, df, None, None, None
 
     if tokenised_file_names:
         tokenised_df = read_file(tokenised_file_names[0])
-        #print("Tokenised df is: ", tokenised_df.head())
-
-    #df = pd.read_parquet(file_in.name)
-
-    df[text_column] = df[text_column].astype(str).str.lower()
 
     if clean == "Yes":
         clean_tic = time.perf_counter()
         print("Starting data clean.")
 
-        df = df.drop_duplicates(text_column)
+        #df = df.drop_duplicates(text_column)
         df_list = list(df[text_column])
         df_list = initial_clean(df_list)
 
@@ -336,20 +342,62 @@ def save_prepared_bm25_data(in_file_name, prepared_text_list, in_df, in_bm25_col
 
     return file_name, new_text_column
 
-def prepare_bm25(corpus, k1=1.5, b = 0.75, alpha=-5):
+def prepare_bm25(corpus, in_file, return_intermediate_files, k1=1.5, b = 0.75, alpha=-5):
     #bm25.save("saved_df_bm25")
     #bm25 = BM25.load(re.sub(r'\.pkl$', '', file_in.name))
 
-    print("Preparing BM25 corpus")
+    file_list = [string.name for string in in_file]
+
+    #print(file_list)
+
+    # Get data file name
+    data_file_names = [string.lower() for string in file_list if "tokenised" not in string and "npz" not in string.lower() and "gz" not in string.lower()]
+
+    data_file_name = data_file_names[0]
+    data_file_out_name = get_file_path_end_with_ext(data_file_name)
+    data_file_name_no_ext = get_file_path_end(data_file_name)
+
+    # Check if there is a search index file already
+    index_file_names = [string.lower() for string in file_list if "gz" in string.lower()]
+
+    if index_file_names:
+        index_file_name = index_file_names[0]
+
+        print(index_file_name)
+
+        bm25_load = read_file(index_file_name)
+
+        #index_file_out_name = get_file_path_end_with_ext(index_file_name)
+        #index_file_name_no_ext = get_file_path_end(index_file_name)
+
+    else:
+        print("Preparing BM25 corpus")
+
+        bm25_load = BM25(corpus, k1=k1, b=b, alpha=alpha)
 
     global bm25
-    bm25 = BM25(corpus, k1=k1, b=b, alpha=alpha)
+    bm25 = bm25_load
+
+    if return_intermediate_files == "Yes":
+        bm25_search_file_name = data_file_name_no_ext + '_' + 'search_index.pkl.gz'
+        #np.savez_compressed(bm25_search_file_name, bm25)
+
+        with gzip.open(bm25_search_file_name, 'wb') as file:
+            pickle.dump(bm25, file)
+
+        print("Search index saved to file")
+
+        message = "Search parameters loaded."
+
+        return message, bm25_search_file_name
 
     message = "Search parameters loaded."
 
     print(message)
 
-    return message
+    return message, None
 
 def convert_bm25_query_to_tokens(free_text_query, clean="No"):
     '''
@@ -418,8 +466,8 @@ def bm25_search(free_text_query, in_no_search_results, original_data, text_colum
 
     # Out file
    query_str_file = ("_").join(token_query)
-    results_df_name = "keyword_search_result_" + today_rev + "_" + query_str_file + ".csv"
-    results_df_out.to_csv(results_df_name, index= None)
+    results_df_name = "keyword_search_result_" + today_rev + "_" + query_str_file + ".xlsx"
+    results_df_out.to_excel(results_df_name, index= None)
     results_first_text = results_df_out[text_column].iloc[0]
 
     print("Returning results")
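
The heart of the new index caching in prepare_bm25 is a plain gzip + pickle round trip. A minimal standalone sketch (BM25Stub and the file name are illustrative stand-ins, not the app's BM25 class):

```python
import gzip
import pickle

class BM25Stub:
    """Stand-in for the app's BM25 object; anything picklable works."""
    def __init__(self, corpus):
        self.corpus = corpus

index = BM25Stub(corpus=[["hello", "world"], ["search", "me"]])

# Save, as prepare_bm25 does when return_intermediate_files == "Yes".
with gzip.open("example_search_index.pkl.gz", "wb") as f:
    pickle.dump(index, f)

# Load, as read_file does for 'pkl.gz' files on a later run.
with gzip.open("example_search_index.pkl.gz", "rb") as f:
    restored = pickle.load(f)

assert restored.corpus == index.corpus
```
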
search_funcs/helper_functions.py CHANGED
@@ -2,10 +2,6 @@ import os
 import re
 import pandas as pd
 import gradio as gr
-
-import os
-import shutil
-
 import os
 import shutil
 import getpass
@@ -35,7 +31,6 @@ def empty_folder(directory_path):
 
 
 
-
 def get_file_path_end(file_path):
     # First, get the basename of the file (e.g., "example.txt" from "/path/to/example.txt")
     basename = os.path.basename(file_path)
@@ -64,6 +59,8 @@ def detect_file_type(filename):
         return 'parquet'
     elif filename.endswith('.pkl.gz'):
         return 'pkl.gz'
+    #elif filename.endswith('.gz'):
+    #    return 'gz'
     else:
         raise ValueError("Unsupported file type.")
 
@@ -82,7 +79,9 @@ def read_file(filename):
     elif file_type == 'pkl.gz':
         with gzip.open(filename, 'rb') as file:
             file = pickle.load(file)
-    #file = pd.read_pickle(filename)
+    #elif file_type == ".gz":
+    #    with gzip.open(filename, 'rb') as file:
+    #        file = pickle.load(file)
 
     print("File load complete")
 
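For reference, the detect_file_type/read_file pair implements a simple suffix dispatch; a condensed sketch of that pattern, keeping only the branches relevant to this commit:

```python
import gzip
import pickle

import pandas as pd

def read_any(filename: str):
    # Condensed suffix dispatch; check the compound '.pkl.gz' suffix first.
    if filename.endswith(".pkl.gz"):
        with gzip.open(filename, "rb") as f:
            return pickle.load(f)  # e.g. a saved BM25 search index
    elif filename.endswith(".parquet"):
        return pd.read_parquet(filename)
    elif filename.endswith(".csv"):
        return pd.read_csv(filename)
    else:
        raise ValueError("Unsupported file type.")
```
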
search_funcs/semantic_functions.py CHANGED
@@ -96,10 +96,10 @@ def docs_to_jina_embed_np_array(docs_out, in_file, return_intermediate_files = "
     ## Load in pre-embedded file if exists
     file_list = [string.name for string in in_file]
 
-    print(file_list)
+    #print(file_list)
 
-    embeddings_file_names = [string.lower() for string in file_list if "npz" in string.lower()]
-    data_file_names = [string.lower() for string in file_list if "tokenised" not in string and "npz" not in string.lower()]
+    embeddings_file_names = [string.lower() for string in file_list if "embedding" in string.lower()]
+    data_file_names = [string.lower() for string in file_list if "tokenised" not in string.lower() and "npz" not in string.lower()]# and "gz" not in string.lower()]
     data_file_name = data_file_names[0]
     data_file_name_no_ext = get_file_path_end(data_file_name)
 
@@ -283,8 +283,8 @@ def jina_simple_retrieval(query_str:str, vectorstore, docs, orig_df_col:str, k_v
 
     query_str_file = query_str.replace(" ", "_")
 
-    results_df_name = "semantic_search_result_" + today_rev + "_" + query_str_file + ".csv"
-    results_df_out.to_csv(results_df_name, index= None)
+    results_df_name = "semantic_search_result_" + today_rev + "_" + query_str_file + ".xlsx"
+    results_df_out.to_excel(results_df_name, index= None)
     results_first_text = results_df_out.iloc[0, 1]
 
     return results_first_text, results_df_name
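
Both search paths now export .xlsx rather than .csv; a small sketch of that export step (pandas defers to an Excel engine such as openpyxl, which must be installed):

```python
import pandas as pd

# Illustrative results frame; the columns are stand-ins for the app's output.
results_df_out = pd.DataFrame({
    "search_text": ["first matching passage", "second matching passage"],
    "search_score": [12.3, 9.8],
})

# Mirrors the renamed output files in this commit; index=False drops the row index.
results_df_out.to_excel("semantic_search_result_example.xlsx", index=False)
```
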
search_funcs/semantic_ingest_functions.py CHANGED
@@ -130,7 +130,7 @@ def parse_csv_or_excel(file_path, data_state, text_column = "text"):
 
     #print(file_list)
 
-    data_file_names = [string.lower() for string in file_list if "tokenised" not in string and "npz" not in string.lower()]
+    data_file_names = [string.lower() for string in file_list if "tokenised" not in string.lower() and "npz" not in string.lower()]# and "gz" not in string.lower()]
 
     data_file_name = data_file_names[0]
 
@@ -329,7 +329,7 @@ def csv_excel_text_to_docs(df, in_file, text_column='text', clean = "No", return
     clean_tic = time.perf_counter()
     print("Starting data clean.")
 
-    df = df.drop_duplicates(text_column)
+    #df = df.drop_duplicates(text_column)
 
     df[text_column] = initial_clean(df[text_column])
     df_list = list(df[text_column])
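
The same substring filter for picking the raw data file out of the upload list recurs in prepare_bm25_input_data, docs_to_jina_embed_np_array and parse_csv_or_excel; a standalone sketch of the pattern (file names illustrative):

```python
file_list = [
    "reviews_cleaned.parquet",
    "reviews_cleaned_tokenised.parquet",
    "reviews_cleaned_search_index.pkl.gz",
]

# Keep only the raw data file by excluding known artefact name fragments,
# as the updated filters in this commit do.
data_file_names = [
    name.lower() for name in file_list
    if "tokenised" not in name.lower()
    and "npz" not in name.lower()
    and "gz" not in name.lower()
]

data_file_name = data_file_names[0]
print(data_file_name)  # reviews_cleaned.parquet
```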