seanpedrickcase committed
Commit • 2089141 • 1 Parent(s): 8466e45

Now checks for output folder before saving. Minor code cleaning
Files changed:
- Dockerfile +1 -1
- README.md +1 -1
- app.py +2 -1
- how_to_create_exe_dist.txt +2 -2
- search_funcs/aws_functions.py +1 -1
- search_funcs/bm25_functions.py +7 -1
- search_funcs/helper_functions.py +12 -0
- search_funcs/semantic_functions.py +7 -2
- search_funcs/semantic_ingest_functions.py +2 -1
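The core of the commit is a small guard, ensure_output_folder_exists(), now called at the top of every function that writes under output/ (the full helper appears in the helper_functions.py diff below). As a minimal sketch of the pattern — some_save_step and the CSV filename are hypothetical, for illustration only:

    from search_funcs.helper_functions import ensure_output_folder_exists

    def some_save_step(df):
        # Guard introduced by this commit: create "output/" before writing into it
        ensure_output_folder_exists()
        df.to_csv("output/results.csv")  # hypothetical write target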
Dockerfile
CHANGED
@@ -14,7 +14,7 @@ COPY requirements.txt .
 RUN pip install --no-cache-dir -r requirements.txt
 
 # Gradio needs to be installed after due to conflict with spacy in requirements
-RUN pip install --no-cache-dir gradio==4.31.
+RUN pip install --no-cache-dir gradio==4.31.4
 
 # Download the BGE embedding model during the build process
 RUN curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | bash
README.md
CHANGED
@@ -4,7 +4,7 @@ emoji: π
 colorFrom: purple
 colorTo: green
 sdk: gradio
-sdk_version: 4.31.
+sdk_version: 4.31.4
 app_file: app.py
 pinned: false
 license: apache-2.0
app.py
CHANGED
@@ -129,7 +129,7 @@ depends on factors such as the type of documents or queries. Information taken f
     with gr.Accordion(label="Data load / save options", open = True):
         with gr.Row():
             in_clean_data = gr.Dropdown(label = "Clean text during load (remove html tags). For large files this may take some time!", value="No", choices=["Yes", "No"])
-            return_intermediate_files = gr.Dropdown(label = "Return intermediate processing files from file preparation. Files can be loaded in to save processing time in future.", value="
+            return_intermediate_files = gr.Dropdown(label = "Return intermediate processing files from file preparation. Files can be loaded in to save processing time in future.", value="No", choices=["Yes", "No"])
             embedding_super_compress = gr.Dropdown(label = "Round embeddings to three dp for smaller files with less accuracy.", value="Yes", choices=["Yes", "No"])
     #save_clean_data_button = gr.Button(value = "Save loaded data to file", scale = 1)
     with gr.Accordion(label="Keyword search options", open = False):
@@ -156,6 +156,7 @@ depends on factors such as the type of documents or queries. Information taken f
     in_join_message = gr.Textbox(label="Join file load progress")
     in_join_column = gr.Dropdown(label="Column to join in new data frame")
     search_df_join_column = gr.Dropdown(label="Column to join in search data frame")
+
     with gr.Accordion(label = "AWS data access", open = False):
         aws_password_box = gr.Textbox(label="Password for AWS data access (ask the Data team if you don't have this)")
         with gr.Row():
how_to_create_exe_dist.txt
CHANGED
@@ -14,7 +14,7 @@ NOTE: for ensuring that spaCy models are loaded into the program correctly in re
 
 9.Run the following, assuming you want to make one single .exe file (This helped me: https://github.com/pyinstaller/pyinstaller/issues/8108):
 
-a) In command line: pyi-makespec --additional-hooks-dir="build_deps\\" --collect-data=gradio_client --collect-data=gradio --hidden-import pyarrow.vendored.version --onefile --name DataSearchApp_0.
+a) In command line: pyi-makespec --additional-hooks-dir="build_deps\\" --collect-data=gradio_client --collect-data=gradio --hidden-import pyarrow.vendored.version --onefile --name DataSearchApp_0.4 app.py
 
 b) Open the created spec file in Notepad. Add the following to the end of the Analysis section then save:
 
@@ -25,7 +25,7 @@ a = Analysis(
 }
 )
 
-c) Back in command line, run this: pyinstaller --clean --noconfirm DataSearchApp_0.
+c) Back in command line, run this: pyinstaller --clean --noconfirm DataSearchApp_0.4.spec
 
 
 9. A 'dist' folder will be created with the executable inside along with all dependencies('dist\data_text_search').
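For reference, the Analysis-section addition mentioned in step (b) is not shown in this hunk — only its closing } and ) appear as context at lines 25-26. Based on the PyInstaller issue linked in step 9, it is likely the module_collection_mode option that forces Gradio to be collected as plain source; a sketch under that assumption, not confirmed by this diff:

    # Hypothetical spec-file addition (assumption, based on pyinstaller issue 8108)
    a = Analysis(
        ['app.py'],
        # ... other options generated by pyi-makespec ...
        module_collection_mode={
            'gradio': 'py',  # collect the gradio package as source .py files
        }
    )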
search_funcs/aws_functions.py
CHANGED
@@ -7,8 +7,8 @@ import os
 PandasDataFrame = Type[pd.DataFrame]
 
 try:
+    session = boto3.Session()
     bucket_name = os.environ['DATA_TEXT_SEARCH_BUCKET']
-    session = boto3.Session(profile_name="default")
 except Exception as e:
     bucket_name = ''
     print(e)
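A note on this change: constructing boto3.Session() without a profile_name lets boto3 fall back to its standard credential chain (environment variables, shared credentials file, then an attached instance or container role) rather than requiring a named "default" profile, which will not exist on a hosted Space. A minimal sketch of the resulting behaviour, assuming standard boto3 semantics — the list_objects_v2 call is illustrative only:

    import os
    import boto3

    # No profile_name: credentials resolve via env vars, ~/.aws/credentials, or a role
    session = boto3.Session()
    bucket_name = os.environ.get('DATA_TEXT_SEARCH_BUCKET', '')

    if bucket_name and session.get_credentials() is not None:
        s3 = session.client('s3')
        response = s3.list_objects_v2(Bucket=bucket_name, MaxKeys=5)  # illustrative call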
search_funcs/bm25_functions.py
CHANGED
@@ -14,7 +14,7 @@ from datetime import datetime
 today_rev = datetime.now().strftime("%Y%m%d")
 
 from search_funcs.clean_funcs import initial_clean # get_lemma_tokens, stem_sentence
-from search_funcs.helper_functions import get_file_path_end_with_ext, get_file_path_end, create_highlighted_excel_wb
+from search_funcs.helper_functions import get_file_path_end_with_ext, get_file_path_end, create_highlighted_excel_wb, ensure_output_folder_exists
 
 # Load the SpaCy model
 from spacy.cli.download import download
@@ -232,6 +232,7 @@ class BM25:
 
 def prepare_bm25_input_data(in_file, text_column, data_state, tokenised_state, clean="No", return_intermediate_files = "No", progress=gr.Progress(track_tqdm=True)):
     #print(in_file)
+    ensure_output_folder_exists()
 
     if not in_file:
         print("No input file found. Please load in at least one file.")
@@ -324,6 +325,7 @@ def prepare_bm25_input_data(in_file, text_column, data_state, tokenised_state, c
         message = "Data loaded. Warning: dataset may be too short to get consistent search results."
 
     if return_intermediate_files == "Yes":
+
         if clean == "Yes":
             tokenised_data_file_name = "output/" + data_file_out_name_no_ext + "_cleaned_tokenised.parquet"
         else:
@@ -337,6 +339,8 @@ def prepare_bm25_input_data(in_file, text_column, data_state, tokenised_state, c
 
 def save_prepared_bm25_data(in_file_name, prepared_text_list, in_df, in_bm25_column, progress=gr.Progress(track_tqdm=True)):
 
+    ensure_output_folder_exists()
+
     # Check if the list and the dataframe have the same length
     if len(prepared_text_list) != len(in_df):
         raise ValueError("The length of 'prepared_text_list' and 'in_df' must match.")
@@ -543,6 +547,8 @@ def bm25_search(free_text_query, in_no_search_results, original_data, searched_d
     results_df_out = results_df_out.sort_values(['search_score_abs', "search_text"], ascending=False)
 
     # Out file
+    ensure_output_folder_exists()
+
     query_str_file = ("_").join(token_query)
     results_df_name = "output/keyword_search_result_" + today_rev + "_" + query_str_file + ".xlsx"
 
search_funcs/helper_functions.py
CHANGED
@@ -58,6 +58,18 @@ def get_file_path_end_with_ext(file_path):
 
     return filename_end
 
+def ensure_output_folder_exists():
+    """Checks if the 'output/' folder exists, creates it if not."""
+
+    folder_name = "output/"
+
+    if not os.path.exists(folder_name):
+        # Create the folder if it doesn't exist
+        os.makedirs(folder_name)
+        print(f"Created the 'output/' folder.")
+    else:
+        print(f"The 'output/' folder already exists.")
+
 def detect_file_type(filename):
     """Detect the file type based on its extension."""
     if (filename.endswith('.csv')) | (filename.endswith('.csv.gz')) | (filename.endswith('.zip')):
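As an aside, the same guard can be collapsed to one call: os.makedirs accepts exist_ok=True, which silently succeeds when the directory already exists. An equivalent sketch, behaviour-preserving apart from the log messages:

    import os

    def ensure_output_folder_exists():
        """Create 'output/' if missing; a no-op when it already exists."""
        os.makedirs("output/", exist_ok=True)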
search_funcs/semantic_functions.py
CHANGED
@@ -25,7 +25,7 @@ else:
 
 print("Device used is: ", torch_device)
 
-from search_funcs.helper_functions import create_highlighted_excel_wb
+from search_funcs.helper_functions import create_highlighted_excel_wb, ensure_output_folder_exists
 
 PandasDataFrame = Type[pd.DataFrame]
 
@@ -67,8 +67,11 @@ else:
 
 def docs_to_bge_embed_np_array(docs_out, in_file, embeddings_state, output_file_state, clean, return_intermediate_files = "No", embeddings_super_compress = "No", embeddings_model = embeddings_model, progress=gr.Progress(track_tqdm=True)):
     '''
-    Takes a Langchain document class and saves it into a
+    Takes a Langchain document class and saves it into a Numpy array.
     '''
+
+    ensure_output_folder_exists()
+
     if not in_file:
         out_message = "No input file found. Please load in at least one file."
         print(out_message)
@@ -229,6 +232,8 @@ def bge_simple_retrieval(query_str:str, vectorstore, docs, orig_df_col:str, k_va
     # print("vectorstore loaded: ", vectorstore)
     progress(0, desc = "Conducting semantic search")
 
+    ensure_output_folder_exists()
+
     print("Searching")
 
     # Convert it to a PyTorch tensor and transfer to GPU
search_funcs/semantic_ingest_functions.py
CHANGED
@@ -31,7 +31,7 @@ chunk_size = 512
 chunk_overlap = 0
 start_index = True
 
-from search_funcs.helper_functions import get_file_path_end_with_ext, detect_file_type, get_file_path_end
+from search_funcs.helper_functions import get_file_path_end_with_ext, detect_file_type, get_file_path_end, ensure_output_folder_exists
 from search_funcs.bm25_functions import save_prepared_bm25_data
 from search_funcs.clean_funcs import initial_clean
 
@@ -198,6 +198,7 @@ def parse_metadata(row):
 def csv_excel_text_to_docs(df, in_file, text_column, clean = "No", return_intermediate_files = "No", chunk_size=None, progress=gr.Progress(track_tqdm=True)) -> List[Document]:
     """Converts a DataFrame's content to a list of dictionaries in the 'Document' format, containing page_content and associated metadata."""
 
+    ensure_output_folder_exists()
     output_list = []
 
     if not in_file:
|