seanpedrickcase committed on
Commit
200480d
1 Parent(s): f2db299

Minor changes to file paths for outputs, documentation, and the location of PyInstaller build dependencies

.gitignore CHANGED
@@ -20,4 +20,5 @@ dist/*
20
  __pycache__/*
21
  db/*
22
  experiments/*
23
- model/*
 
 
20
  __pycache__/*
21
  db/*
22
  experiments/*
23
+ model/*
24
+ build_deps/*
README.md CHANGED
@@ -15,8 +15,8 @@ Search through long-form text fields in your tabular data. Either for exact, spe
15
  # Guide
16
  ## Keyword search
17
 
18
- 1. Load in your tabular data file (.csv, .parquet, .xlsx - first sheet).
19
- 2. Wait a few seconds for the file to upload, then in the dropdown menu below 'Enter the name of the text column...' choose the column from the data file that you want to search.
20
  3. Hit 'Load data'. The 'Load progress' text box will let you know when the file is ready.
21
  4. In the 'Enter your search term' area below this, type in the key words you want to find in your text. Note that if the term is not spelled exactly as it is found in the text, it will not be found!
22
  5. Hit search text. You may have to wait depending on the size of the data you are searching.
@@ -26,8 +26,8 @@ Search through long-form text fields in your tabular data. Either for exact, spe
26
 
27
  This search type enables you to search for broader themes (e.g. happiness, nature) and the search will pick out text passages that relate to these themes even if they don't contain the exact words.
28
 
29
- 1. Load in your tabular data file (.csv, .parquet, .xlsx - first sheet).
30
- 2. Wait a few seconds for the file to upload, then in the dropdown menu below 'Enter the name of the text column...' choose the column from the data file that you want to search.
31
  3. Hit 'Load data'. The 'Load progress' text box will let you know when the file is ready.
32
  4. In the 'Enter semantic search query here' area below this, type in the terms you would like to search for.
33
  5. Press 'Start semantic search'. You may have to wait depending on the size of the data you are searching.
 
15
  # Guide
16
  ## Keyword search
17
 
18
+ 1. Load in your tabular data file (.csv, .parquet, .xlsx - first sheet). If the 'Keyword search' folder has been prepared, select both of the .parquet files in this folder (the files with and without 'tokenised' in the name) to load into the app.
19
+ 2. Wait for the file(s) to upload, then in the dropdown menu below 'Enter the name of the text column...' choose the column from the data file that you want to search.
20
  3. Hit 'Load data'. The 'Load progress' text box will let you know when the file is ready.
21
  4. In the 'Enter your search term' area below this, type in the key words you want to find in your text. Note that if the term is not spelled exactly as it is found in the text, it will not be found!
22
  5. Hit search text. You may have to wait depending on the size of the data you are searching.
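The two files in step 1 can also be opened outside the app if you want to inspect them. A minimal sketch, assuming hypothetical file names and using pandas; the 'Corpus' column name is taken from the tokenised output written by search_funcs/bm25_functions.py in this commit:

```python
import pandas as pd

# Hypothetical file names - substitute the pair from your own 'Keyword search' folder.
data_file = "responses_cleaned.parquet"                 # main data file
tokenised_file = "responses_cleaned_tokenised.parquet"  # pre-tokenised corpus

# The main file holds the original rows; the tokenised file holds a single
# 'Corpus' column of token lists, saved so the app can skip re-tokenising.
df = pd.read_parquet(data_file)
tokenised_df = pd.read_parquet(tokenised_file)

print(df.columns)                     # choose the text column to search from these
print(tokenised_df["Corpus"].head())  # token lists, one per row of the data file
```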
 
26
 
27
  This search type enables you to search for broader themes (e.g. happiness, nature) and the search will pick out text passages that relate to these themes even if they don't contain the exact words.
28
 
29
+ 1. Load in your tabular data file (.csv, .parquet, .xlsx - first sheet). If the 'Semantic search' folder has been prepared, select both of the files in this folder to load into the app. These should be a '...prepared_docs.pkl.gz' file and a '...embeddings_compress.npz' or '...embeddings.npz' file.
30
+ 2. Wait for the file(s) to upload, then in the dropdown menu below 'Enter the name of the text column...' choose the column from the data file that you want to search. If the 'Semantic search' folder has been prepared, this field should be 'page_contents'.
31
  3. Hit 'Load data'. The 'Load progress' text box will let you know when the file is ready.
32
  4. In the 'Enter semantic search query here' area below this, type in the terms you would like to search for.
33
  5. Press 'Start semantic search'. You may have to wait depending on the size of the data you are searching.
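Likewise, the prepared semantic search files in step 1 can be opened directly. A minimal sketch, assuming the '...prepared_docs.pkl.gz' file is a gzipped pickle (as the extension suggests) and using hypothetical file names; the 'arr_0' key and the divide-by-100 step for 'compress' files mirror the loading code in search_funcs/semantic_functions.py below:

```python
import gzip
import pickle

import numpy as np

# Hypothetical file names - substitute the pair from your own 'Semantic search' folder.
docs_file = "responses_prepared_docs.pkl.gz"
embeddings_file = "responses_embeddings.npz"   # or the 'compress' variant

# The prepared docs are a gzipped pickle of document objects whose searchable
# text the app exposes as 'page_contents'.
with gzip.open(docs_file, "rb") as f:
    docs_out = pickle.load(f)

# Embeddings are saved with np.savez_compressed, so they sit under 'arr_0';
# files with 'compress' in the name were scaled by 100 before saving.
embeddings_out = np.load(embeddings_file)["arr_0"]
if "compress" in embeddings_file.lower():
    embeddings_out = embeddings_out / 100

print(len(docs_out), embeddings_out.shape)
```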
app.py CHANGED
@@ -58,7 +58,7 @@ depends on factors such as the type of documents or queries. Information taken f
58
  """
59
  **Exact term keyword search**
60
 
61
- 1. Load in data file (ideally a file with '_cleaned' at the end of the name), with (optionally) the '...tokenised_data.parquet' in the same folder to save loading time. 2. Select the field in your data to search. Ideally this will have the suffix '_cleaned' to show that html tags have been removed. 3. Wait for the data file to be prepared for search. 4. Enter the search term in the relevant box below and press Enter/click on 'Search text'. 4. Your search results will be saved in a csv file and will be presented in the 'File output' area below.
62
  """)
63
  with gr.Row():
64
  current_source = gr.Textbox(label="Current data source(s)", value="None")
@@ -88,7 +88,7 @@ depends on factors such as the type of documents or queries. Information taken f
88
  """
89
  **Thematic/semantic search**
90
 
91
- This search type enables you to search for broader themes (e.g. happiness, nature) and the search will pick out text passages that relate to these themes even if they don't contain the exact words. 1. Load in data file (ideally a file with '_cleaned' at the end of the name), with (optionally) the 'semantic_search_embeddings.npz' in the same folder to save loading time. 2. Select the field in your data to search. Ideally this will have the suffix '_cleaned' to show that html tags have been removed. 3. Wait for the data file to be prepared for search. 4. Enter the search term in the 'Enter semantic search query here' box below and press Enter/click on 'Start semantic search'. 4. Your search results will be saved in a csv file and will be presented in the 'File output' area below.
92
  """)
93
  with gr.Row():
94
  current_source_semantic = gr.Textbox(label="Current data source(s)", value="None")
 
58
  """
59
  **Exact term keyword search**
60
 
61
+ 1. Load in a data file (ideally a file with '_cleaned' at the end of the name), with (optionally) the '...tokenised.parquet' file in the same folder to save loading time. 2. Select the field in your data to search. A field with the suffix '_cleaned' means that HTML tags have been removed. 3. Wait for the data file to be prepared for search. 4. Enter the search term in the relevant box below and press Enter/click on 'Search text'. 5. Your search results will be saved in a csv file and will be presented in the 'File output' area below.
62
  """)
63
  with gr.Row():
64
  current_source = gr.Textbox(label="Current data source(s)", value="None")
 
88
  """
89
  **Thematic/semantic search**
90
 
91
+ This search type enables you to search for broader themes (e.g. happiness, nature) and the search will pick out text passages that relate to these themes even if they don't contain the exact words. 1. Load in a data file (ideally a pkl.gz file with '_cleaned' at the end of the name), with (optionally) the 'embeddings... .npz' file in the same folder to save loading time. 2. Select the field in your data to search. If you loaded in a documents pkl.gz file, this will be 'page_contents'. 3. Wait for the data file to be prepared for search. 4. Enter the search term in the 'Enter semantic search query here' box below and press Enter/click on 'Start semantic search'. 5. Your search results will be saved in a csv file and will be presented in the 'File output' area below.
92
  """)
93
  with gr.Row():
94
  current_source_semantic = gr.Textbox(label="Current data source(s)", value="None")
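The app.py edits above only change the Markdown help text inside the Gradio layout. For context, a minimal sketch of the surrounding pattern, assuming a standard gr.Blocks wrapper (the component names and labels are taken from the context lines above; the real app defines many more components):

```python
import gradio as gr

with gr.Blocks() as block:
    # Help text for the keyword search section (abridged).
    gr.Markdown(
        """
        **Exact term keyword search**

        1. Load in a data file ... 2. Select the field in your data to search ...
        """)
    # Shows which file(s) are currently loaded for searching.
    with gr.Row():
        current_source = gr.Textbox(label="Current data source(s)", value="None")

block.launch()
```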
hook-en_core_web_sm.py DELETED
@@ -1,8 +0,0 @@
1
- from PyInstaller.utils.hooks import collect_data_files
2
-
3
- hiddenimports = [
4
- 'en_core_web_sm'
5
- ]
6
-
7
- # Use collect_data_files to find data files. Replace 'en_core_web_sm' with the correct package name if it's different.
8
- datas = collect_data_files('en_core_web_sm')
hook-gradio.py DELETED
@@ -1,8 +0,0 @@
1
- from PyInstaller.utils.hooks import collect_data_files
2
-
3
- hiddenimports = [
4
- 'gradio'
5
- ]
6
-
7
- # Use collect_data_files to find data files. Replace 'gradio' with the correct package name if it's different.
8
- datas = collect_data_files('gradio')
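These two hooks are removed from the repository root. Given the commit message and the `--additional-hooks-dir="build_deps\\"` flag added to how_to_create_exe_dist.txt below, they presumably now live under build_deps/. A sketch of the relocated gradio hook, with the contents taken from the deleted file above (the path is an assumption):

```python
# build_deps/hook-gradio.py (assumed new location)
from PyInstaller.utils.hooks import collect_data_files

# Declare gradio as a hidden import so PyInstaller bundles it even if its
# static analysis misses the dependency.
hiddenimports = [
    'gradio'
]

# Collect gradio's non-Python data files (templates, frontend assets) into the build.
datas = collect_data_files('gradio')
```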
how_to_create_exe_dist.txt CHANGED
@@ -17,10 +17,10 @@ NOTE: for ensuring that spaCy models are loaded into the program correctly in re
17
  8. In command line, cd to the folder that contains app.py. Then run the following:
18
 
19
  For one single file:
20
- python -m PyInstaller --additional-hooks-dir=. --hidden-import pyarrow.vendored.version --add-data="types.json;gradio_client" --add-data "model;model" --onefile --clean --noconfirm --name DataSearchApp_0.1 app.py
21
 
22
  For a small exe with a folder of dependencies:
23
- python -m PyInstaller --additional-hooks-dir=. --hidden-import pyarrow.vendored.version --add-data="types.json;gradio_client" --add-data "model;model" --clean --noconfirm --name DataSearchApp_0.1 app.py
24
 
25
  9. A 'dist' folder will be created with the executable inside along with all dependencies('dist\data_text_search').
26
 
 
17
  8. In command line, cd to the folder that contains app.py. Then run the following:
18
 
19
  For one single file:
20
+ python -m PyInstaller --additional-hooks-dir="build_deps\\" --hidden-import pyarrow.vendored.version --add-data="build_deps\\types.json;gradio_client" --add-data "model;model" --onefile --clean --noconfirm --name DataSearchApp_0.1.1 app.py
21
 
22
  For a small exe with a folder of dependencies:
23
+ python -m PyInstaller --additional-hooks-dir="build_deps\\" --hidden-import pyarrow.vendored.version --add-data="build_deps\\types.json;gradio_client" --add-data "model;model" --clean --noconfirm --name DataSearchApp_0.1.1 app.py
24
 
25
  9. A 'dist' folder will be created with the executable inside along with all dependencies('dist\data_text_search').
26
 
search_funcs/bm25_functions.py CHANGED
@@ -235,7 +235,7 @@ def prepare_bm25_input_data(in_file, text_column, data_state, clean="No", retur
235
 
236
  #print(file_list)
237
 
238
- data_file_names = [string for string in file_list if "tokenised" not in string and "embeddings" not in string]
239
 
240
  data_file_name = data_file_names[0]
241
 
@@ -246,7 +246,7 @@ def prepare_bm25_input_data(in_file, text_column, data_state, clean="No", retur
246
  ## Load in pre-tokenised corpus if exists
247
  tokenised_df = pd.DataFrame()
248
 
249
- tokenised_file_names = [string for string in file_list if "tokenised" in string]
250
 
251
  if tokenised_file_names:
252
  tokenised_df = read_file(tokenised_file_names[0])
@@ -303,7 +303,7 @@ def prepare_bm25_input_data(in_file, text_column, data_state, clean="No", retur
303
  message = "Data loaded. Warning: dataset may be too short to get consistent search results."
304
 
305
  if return_intermediate_files == "Yes":
306
- tokenised_data_file_name = data_file_out_name_no_ext + "_" + "keyword_search_tokenised_data.parquet"
307
  pd.DataFrame(data={"Corpus":corpus}).to_parquet(tokenised_data_file_name)
308
 
309
  return corpus, message, df, out_file_name, tokenised_data_file_name, data_file_out_name
@@ -374,53 +374,54 @@ def convert_bm25_query_to_tokens(free_text_query, clean="No"):
374
 
375
  def bm25_search(free_text_query, in_no_search_results, original_data, text_column, clean = "No", in_join_file = None, in_join_column = "", search_df_join_column = ""):
376
 
377
- # Prepare query
378
- if (clean == "Yes") | (text_column.endswith("_cleaned")):
379
- token_query = convert_bm25_query_to_tokens(free_text_query, clean="Yes")
380
- else:
381
- token_query = convert_bm25_query_to_tokens(free_text_query, clean="No")
 
 
 
 
 
382
 
383
- #print(token_query)
 
 
384
 
385
- # Perform search
386
- print("Searching")
387
 
388
- results_index, results_text, results_scores = bm25.extract_documents_and_scores(token_query, bm25.corpus, n=in_no_search_results) #bm25.corpus #original_data[text_column]
389
- if not results_index:
390
- return "No search results found", None, token_query
 
 
 
 
391
 
392
- print("Search complete")
 
 
393
 
394
- # Prepare results and export
395
- joined_texts = [' '.join(inner_list) for inner_list in results_text]
396
- results_df = pd.DataFrame(data={"index": results_index,
397
- "search_text": joined_texts,
398
- "search_score_abs": results_scores})
399
- results_df['search_score_abs'] = abs(round(results_df['search_score_abs'], 2))
400
- results_df_out = results_df[['index', 'search_text', 'search_score_abs']].merge(original_data,left_on="index", right_index=True, how="left")#.drop("index", axis=1)
401
-
402
- # Join on additional files
403
- if in_join_file:
404
- join_filename = in_join_file.name
405
 
406
- # Import data
407
- join_df = read_file(join_filename)
408
- join_df[in_join_column] = join_df[in_join_column].astype(str).str.replace("\.0$","", regex=True)
409
- results_df_out[search_df_join_column] = results_df_out[search_df_join_column].astype(str).str.replace("\.0$","", regex=True)
410
 
411
- # Duplicates dropped so as not to expand out dataframe
412
- join_df = join_df.drop_duplicates(in_join_column)
413
 
414
- results_df_out = results_df_out.merge(join_df,left_on=search_df_join_column, right_on=in_join_column, how="left").drop(in_join_column, axis=1)
415
-
416
- # Reorder results by score
417
- results_df_out = results_df_out.sort_values('search_score_abs', ascending=False)
418
 
419
- # Out file
420
- results_df_name = "keyword_search_result_" + today_rev + ".csv"
421
- results_df_out.to_csv(results_df_name, index= None)
422
- results_first_text = results_df_out[text_column].iloc[0]
 
423
 
424
- print("Returning results")
425
 
426
- return results_first_text, results_df_name, token_query
 
235
 
236
  #print(file_list)
237
 
238
+ data_file_names = [string.lower() for string in file_list if "tokenised" not in string and "npz" not in string.lower()]
239
 
240
  data_file_name = data_file_names[0]
241
 
 
246
  ## Load in pre-tokenised corpus if exists
247
  tokenised_df = pd.DataFrame()
248
 
249
+ tokenised_file_names = [string.lower() for string in file_list if "tokenised" in string.lower()]
250
 
251
  if tokenised_file_names:
252
  tokenised_df = read_file(tokenised_file_names[0])
 
303
  message = "Data loaded. Warning: dataset may be too short to get consistent search results."
304
 
305
  if return_intermediate_files == "Yes":
306
+ tokenised_data_file_name = data_file_out_name_no_ext + "_" + "tokenised.parquet"
307
  pd.DataFrame(data={"Corpus":corpus}).to_parquet(tokenised_data_file_name)
308
 
309
  return corpus, message, df, out_file_name, tokenised_data_file_name, data_file_out_name
 
374
 
375
  def bm25_search(free_text_query, in_no_search_results, original_data, text_column, clean = "No", in_join_file = None, in_join_column = "", search_df_join_column = ""):
376
 
377
+ # Prepare query
378
+ if (clean == "Yes") | (text_column.endswith("_cleaned")):
379
+ token_query = convert_bm25_query_to_tokens(free_text_query, clean="Yes")
380
+ else:
381
+ token_query = convert_bm25_query_to_tokens(free_text_query, clean="No")
382
+
383
+ #print(token_query)
384
+
385
+ # Perform search
386
+ print("Searching")
387
 
388
+ results_index, results_text, results_scores = bm25.extract_documents_and_scores(token_query, bm25.corpus, n=in_no_search_results) #bm25.corpus #original_data[text_column]
389
+ if not results_index:
390
+ return "No search results found", None, token_query
391
 
392
+ print("Search complete")
 
393
 
394
+ # Prepare results and export
395
+ joined_texts = [' '.join(inner_list) for inner_list in results_text]
396
+ results_df = pd.DataFrame(data={"index": results_index,
397
+ "search_text": joined_texts,
398
+ "search_score_abs": results_scores})
399
+ results_df['search_score_abs'] = abs(round(results_df['search_score_abs'], 2))
400
+ results_df_out = results_df[['index', 'search_text', 'search_score_abs']].merge(original_data,left_on="index", right_index=True, how="left")#.drop("index", axis=1)
401
 
402
+ # Join on additional files
403
+ if in_join_file:
404
+ join_filename = in_join_file.name
405
 
406
+ # Import data
407
+ join_df = read_file(join_filename)
408
+ join_df[in_join_column] = join_df[in_join_column].astype(str).str.replace("\.0$","", regex=True)
409
+ results_df_out[search_df_join_column] = results_df_out[search_df_join_column].astype(str).str.replace("\.0$","", regex=True)
 
 
 
 
 
 
 
410
 
411
+ # Duplicates dropped so as not to expand out dataframe
412
+ join_df = join_df.drop_duplicates(in_join_column)
 
 
413
 
414
+ results_df_out = results_df_out.merge(join_df,left_on=search_df_join_column, right_on=in_join_column, how="left").drop(in_join_column, axis=1)
 
415
 
416
+ # Reorder results by score
417
+ results_df_out = results_df_out.sort_values('search_score_abs', ascending=False)
 
 
418
 
419
+ # Out file
420
+ query_str_file = ("_").join(token_query)
421
+ results_df_name = "keyword_search_result_" + today_rev + "_" + query_str_file + ".csv"
422
+ results_df_out.to_csv(results_df_name, index= None)
423
+ results_first_text = results_df_out[text_column].iloc[0]
424
 
425
+ print("Returning results")
426
 
427
+ return results_first_text, results_df_name, token_query
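The main behavioural change in bm25_search is the output file name, which now embeds the tokenised query so results from different searches no longer overwrite each other. A standalone sketch of the naming logic; the exact format of today_rev is defined elsewhere in the module, so a yyyymmdd stamp is assumed here:

```python
from datetime import datetime

# Assumed stand-in for the module-level today_rev value.
today_rev = datetime.now().strftime("%Y%m%d")

token_query = ["climate", "change"]            # example tokenised query
query_str_file = "_".join(token_query)
results_df_name = "keyword_search_result_" + today_rev + "_" + query_str_file + ".csv"

print(results_df_name)   # e.g. keyword_search_result_20240115_climate_change.csv
```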
search_funcs/helper_functions.py CHANGED
@@ -97,7 +97,7 @@ def put_columns_in_df(in_file, in_bm25_column):
97
 
98
  #print(file_list)
99
 
100
- data_file_names = [string for string in file_list if "tokenised" not in string and "embeddings" not in string]
101
  data_file_name = data_file_names[0]
102
 
103
  new_choices = []
 
97
 
98
  #print(file_list)
99
 
100
+ data_file_names = [string.lower() for string in file_list if "tokenised" not in string and "npz" not in string.lower()]
101
  data_file_name = data_file_names[0]
102
 
103
  new_choices = []
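The filter in put_columns_in_df (and the matching lines in bm25_functions.py and semantic_ingest_functions.py) now excludes embedding files by their .npz extension rather than by the word 'embeddings', and lower-cases the names it keeps. A small self-contained example of which uploaded files survive, using hypothetical file names:

```python
# Hypothetical uploaded file names.
file_list = [
    "Responses_cleaned.parquet",
    "responses_cleaned_tokenised.parquet",
    "responses_cleaned_embeddings.npz",
]

# Same expression as the new filter: drop tokenised corpora and .npz files,
# and lower-case whatever remains.
data_file_names = [string.lower() for string in file_list
                   if "tokenised" not in string and "npz" not in string.lower()]

print(data_file_names)   # ['responses_cleaned.parquet']
```

Note that the 'tokenised' check is case-sensitive while the 'npz' check is not, and the kept name is returned lower-cased.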
search_funcs/semantic_functions.py CHANGED
@@ -96,10 +96,10 @@ def docs_to_jina_embed_np_array(docs_out, in_file, return_intermediate_files = "
96
  ## Load in pre-embedded file if exists
97
  file_list = [string.name for string in in_file]
98
 
99
- #print(file_list)
100
 
101
- embeddings_file_names = [string for string in file_list if "embedding" in string]
102
- data_file_names = [string for string in file_list if "tokenised" not in string]
103
  data_file_name = data_file_names[0]
104
  data_file_name_no_ext = get_file_path_end(data_file_name)
105
 
@@ -110,7 +110,7 @@ def docs_to_jina_embed_np_array(docs_out, in_file, return_intermediate_files = "
110
  embeddings_out = np.load(embeddings_file_names[0])['arr_0']
111
 
112
  # If embedding files have 'super_compress' in the title, they have been multiplied by 100 before save
113
- if "super_compress" in embeddings_file_names[0]:
114
  embeddings_out /= 100
115
 
116
  # print("embeddings loaded: ", embeddings_out)
@@ -125,8 +125,6 @@ def docs_to_jina_embed_np_array(docs_out, in_file, return_intermediate_files = "
125
  embeddings_out = embeddings.encode(sentences=page_contents, max_length=1024, show_progress_bar = True, batch_size = 32) # For Jina embeddings
126
  #embeddings_list = embeddings.encode(sentences=page_contents, normalize_embeddings=True).tolist() # For BGE embeddings
127
  #embeddings_list = embeddings.encode(sentences=page_contents).tolist() # For minilm
128
-
129
-
130
 
131
  toc = time.perf_counter()
132
  time_out = f"The embedding took {toc - tic:0.1f} seconds"
@@ -135,10 +133,10 @@ def docs_to_jina_embed_np_array(docs_out, in_file, return_intermediate_files = "
135
  # If you want to save your files for next time
136
  if return_intermediate_files == "Yes":
137
  if embeddings_super_compress == "No":
138
- semantic_search_file_name = data_file_name_no_ext + '_' + 'semantic_search_embeddings.npz'
139
  np.savez_compressed(semantic_search_file_name, embeddings_out)
140
  else:
141
- semantic_search_file_name = data_file_name_no_ext + '_' + 'semantic_search_embeddings_super_compress.npz'
142
  embeddings_out_round = np.round(embeddings_out, 3)
143
  embeddings_out_round *= 100 # Rounding not currently used
144
  np.savez_compressed(semantic_search_file_name, embeddings_out_round)
@@ -231,7 +229,7 @@ def process_data_from_scores_df(df_docs, in_join_file, out_passages, vec_score_c
231
 
232
  return results_df_out
233
 
234
- def jina_simple_retrieval(new_question_kworded:str, vectorstore, docs, orig_df_col:str, k_val:int, out_passages:int,
235
  vec_score_cut_off:float, vec_weight:float, in_join_file = None, in_join_column = None, search_df_join_column = None, device = torch_device, embeddings = embeddings_model, progress=gr.Progress()): # ,vectorstore, embeddings
236
 
237
  # print("vectorstore loaded: ", vectorstore)
@@ -243,7 +241,7 @@ def jina_simple_retrieval(new_question_kworded:str, vectorstore, docs, orig_df_c
243
  embeddings = embeddings.to(device)
244
 
245
  # Encode the query using the sentence transformer and convert to a PyTorch tensor
246
- query = embeddings.encode(new_question_kworded)
247
  query_tensor = tensor(query).to(device)
248
 
249
  if query_tensor.dim() == 1:
@@ -282,8 +280,10 @@ def jina_simple_retrieval(new_question_kworded:str, vectorstore, docs, orig_df_c
282
  # If nothing found, return error message
283
  if results_df_out.empty:
284
  return 'No result found!', None
 
 
285
 
286
- results_df_name = "semantic_search_result_" + today_rev + ".csv"
287
  results_df_out.to_csv(results_df_name, index= None)
288
  results_first_text = results_df_out.iloc[0, 1]
289
 
@@ -394,10 +394,10 @@ def docs_to_chroma_save_deprecated(docs_out, embeddings = embeddings_model, prog
394
 
395
  return out_message, collection
396
 
397
- def chroma_retrieval_deprecated(new_question_kworded:str, vectorstore, docs, orig_df_col:str, k_val:int, out_passages:int,
398
  vec_score_cut_off:float, vec_weight:float, in_join_file = None, in_join_column = None, search_df_join_column = None, embeddings = embeddings_model): # ,vectorstore, embeddings
399
 
400
- query = embeddings.encode(new_question_kworded).tolist()
401
 
402
  docs = vectorstore.query(
403
  query_embeddings=query,
 
96
  ## Load in pre-embedded file if exists
97
  file_list = [string.name for string in in_file]
98
 
99
+ print(file_list)
100
 
101
+ embeddings_file_names = [string.lower() for string in file_list if "npz" in string.lower()]
102
+ data_file_names = [string.lower() for string in file_list if "tokenised" not in string and "npz" not in string.lower()]
103
  data_file_name = data_file_names[0]
104
  data_file_name_no_ext = get_file_path_end(data_file_name)
105
 
 
110
  embeddings_out = np.load(embeddings_file_names[0])['arr_0']
111
 
112
  # If embedding files have 'super_compress' in the title, they have been multiplied by 100 before save
113
+ if "compress" in embeddings_file_names[0]:
114
  embeddings_out /= 100
115
 
116
  # print("embeddings loaded: ", embeddings_out)
 
125
  embeddings_out = embeddings.encode(sentences=page_contents, max_length=1024, show_progress_bar = True, batch_size = 32) # For Jina embeddings
126
  #embeddings_list = embeddings.encode(sentences=page_contents, normalize_embeddings=True).tolist() # For BGE embeddings
127
  #embeddings_list = embeddings.encode(sentences=page_contents).tolist() # For minilm
 
 
128
 
129
  toc = time.perf_counter()
130
  time_out = f"The embedding took {toc - tic:0.1f} seconds"
 
133
  # If you want to save your files for next time
134
  if return_intermediate_files == "Yes":
135
  if embeddings_super_compress == "No":
136
+ semantic_search_file_name = data_file_name_no_ext + '_' + 'embeddings.npz'
137
  np.savez_compressed(semantic_search_file_name, embeddings_out)
138
  else:
139
+ semantic_search_file_name = data_file_name_no_ext + '_' + 'embedding_compress.npz'
140
  embeddings_out_round = np.round(embeddings_out, 3)
141
  embeddings_out_round *= 100 # Rounding not currently used
142
  np.savez_compressed(semantic_search_file_name, embeddings_out_round)
 
229
 
230
  return results_df_out
231
 
232
+ def jina_simple_retrieval(query_str:str, vectorstore, docs, orig_df_col:str, k_val:int, out_passages:int,
233
  vec_score_cut_off:float, vec_weight:float, in_join_file = None, in_join_column = None, search_df_join_column = None, device = torch_device, embeddings = embeddings_model, progress=gr.Progress()): # ,vectorstore, embeddings
234
 
235
  # print("vectorstore loaded: ", vectorstore)
 
241
  embeddings = embeddings.to(device)
242
 
243
  # Encode the query using the sentence transformer and convert to a PyTorch tensor
244
+ query = embeddings.encode(query_str)
245
  query_tensor = tensor(query).to(device)
246
 
247
  if query_tensor.dim() == 1:
 
280
  # If nothing found, return error message
281
  if results_df_out.empty:
282
  return 'No result found!', None
283
+
284
+ query_str_file = query_str.replace(" ", "_")
285
 
286
+ results_df_name = "semantic_search_result_" + today_rev + "_" + query_str_file + ".csv"
287
  results_df_out.to_csv(results_df_name, index= None)
288
  results_first_text = results_df_out.iloc[0, 1]
289
 
 
394
 
395
  return out_message, collection
396
 
397
+ def chroma_retrieval_deprecated(query_str:str, vectorstore, docs, orig_df_col:str, k_val:int, out_passages:int,
398
  vec_score_cut_off:float, vec_weight:float, in_join_file = None, in_join_column = None, search_df_join_column = None, embeddings = embeddings_model): # ,vectorstore, embeddings
399
 
400
+ query = embeddings.encode(query_str).tolist()
401
 
402
  docs = vectorstore.query(
403
  query_embeddings=query,
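The renamed embedding outputs ('...embeddings.npz' and '...embedding_compress.npz') pair with the relaxed load-time check for 'compress' in the file name. A round-trip sketch of the save and load logic, using toy embeddings and a hypothetical base file name:

```python
import numpy as np

embeddings_out = np.random.rand(4, 8).astype(np.float32)   # toy stand-in for model output
data_file_name_no_ext = "responses_cleaned"                # hypothetical base name

# Save path, as in docs_to_jina_embed_np_array: either a plain compressed .npz,
# or a 'compress' variant rounded to 3 d.p. and scaled by 100 before saving.
embeddings_super_compress = "Yes"
if embeddings_super_compress == "No":
    semantic_search_file_name = data_file_name_no_ext + '_' + 'embeddings.npz'
    np.savez_compressed(semantic_search_file_name, embeddings_out)
else:
    semantic_search_file_name = data_file_name_no_ext + '_' + 'embedding_compress.npz'
    embeddings_out_round = np.round(embeddings_out, 3)
    embeddings_out_round *= 100
    np.savez_compressed(semantic_search_file_name, embeddings_out_round)

# Load path: np.savez_compressed stores the array under 'arr_0', and anything
# with 'compress' in the name is scaled back down by 100.
loaded = np.load(semantic_search_file_name)['arr_0']
if "compress" in semantic_search_file_name:
    loaded = loaded / 100

print(np.allclose(loaded, embeddings_out, atol=0.005))   # True, within rounding error
```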
search_funcs/semantic_ingest_functions.py CHANGED
@@ -130,7 +130,7 @@ def parse_csv_or_excel(file_path, data_state, text_column = "text"):
130
 
131
  #print(file_list)
132
 
133
- data_file_names = [string for string in file_list if "tokenised" not in string and "embeddings" not in string]
134
 
135
  data_file_name = data_file_names[0]
136
 
@@ -299,7 +299,7 @@ def csv_excel_text_to_docs(df, in_file, text_column='text', clean = "No", return
299
 
300
  file_list = [string.name for string in in_file]
301
 
302
- data_file_names = [string for string in file_list if "tokenised" not in string and "embeddings" not in string]
303
  data_file_name = data_file_names[0]
304
 
305
  # Check if file is a document format, and explode out as needed
@@ -312,7 +312,7 @@ def csv_excel_text_to_docs(df, in_file, text_column='text', clean = "No", return
312
 
313
  doc_sections = df
314
 
315
- print(doc_sections[0])
316
 
317
  # Convert each element in the Series to a Document instance
318
  #doc_sections = section_series.apply(lambda x: Document(**x))
@@ -365,7 +365,7 @@ def csv_excel_text_to_docs(df, in_file, text_column='text', clean = "No", return
365
 
366
  if return_intermediate_files == "Yes":
367
  data_file_out_name_no_ext = get_file_path_end(data_file_name)
368
- file_name = data_file_out_name_no_ext + "_cleaned"
369
  #print(doc_sections)
370
  #page_content_series_string = pd.Series(doc_sections).astype(str)
371
  #page_content_series_string = page_content_series_string.str.replace(" type='Document'", "").str.replace("' metadata=", "', 'metadata':").str.replace("page_content=", "{'page_content':")
 
130
 
131
  #print(file_list)
132
 
133
+ data_file_names = [string.lower() for string in file_list if "tokenised" not in string and "npz" not in string.lower()]
134
 
135
  data_file_name = data_file_names[0]
136
 
 
299
 
300
  file_list = [string.name for string in in_file]
301
 
302
+ data_file_names = [string.lower() for string in file_list if "tokenised" not in string and "npz" not in string.lower()]
303
  data_file_name = data_file_names[0]
304
 
305
  # Check if file is a document format, and explode out as needed
 
312
 
313
  doc_sections = df
314
 
315
+ #print(doc_sections[0])
316
 
317
  # Convert each element in the Series to a Document instance
318
  #doc_sections = section_series.apply(lambda x: Document(**x))
 
365
 
366
  if return_intermediate_files == "Yes":
367
  data_file_out_name_no_ext = get_file_path_end(data_file_name)
368
+ file_name = data_file_out_name_no_ext
369
  #print(doc_sections)
370
  #page_content_series_string = pd.Series(doc_sections).astype(str)
371
  #page_content_series_string = page_content_series_string.str.replace(" type='Document'", "").str.replace("' metadata=", "', 'metadata':").str.replace("page_content=", "{'page_content':")