Spaces:
Running
Running
The app now checks whether embeddings are already loaded before topic modelling, and saves them only once.
Browse files
- app.py +13 -10
- funcs/embeddings.py +41 -41
- funcs/helper_functions.py +4 -2
app.py
CHANGED
@@ -80,8 +80,8 @@ hf_model_name = 'TheBloke/phi-2-orange-GGUF' #'NousResearch/Nous-Capybara-7B-V1
|
|
80 |
hf_model_file = 'phi-2-orange.Q5_K_M.gguf' #'Capybara-7B-V1.9-Q5_K_M.gguf' # 'stablelm-2-zephyr-1_6b-Q5_K_M.gguf'
|
81 |
|
82 |
|
83 |
-
def extract_topics(in_files, in_file, min_docs_slider, in_colnames, max_topics_slider, candidate_topics, in_label, anonymise_drop, return_intermediate_files, embeddings_super_compress, low_resource_mode, create_llm_topic_labels, save_topic_model, visualise_topics, reduce_outliers):
|
84 |
-
|
85 |
all_tic = time.perf_counter()
|
86 |
|
87 |
output_list = []
|
@@ -144,7 +144,7 @@ def extract_topics(in_files, in_file, min_docs_slider, in_colnames, max_topics_s
|
|
144 |
|
145 |
umap_model = TruncatedSVD(n_components=5, random_state=random_seed)
|
146 |
|
147 |
-
embeddings_out, reduced_embeddings = make_or_load_embeddings(docs, file_list, data_file_name_no_ext, embedding_model, return_intermediate_files, embeddings_super_compress, low_resource_mode, create_llm_topic_labels)
|
148 |
|
149 |
vectoriser_model = CountVectorizer(stop_words="english", ngram_range=(1, 2), min_df=0.1)
|
150 |
|
@@ -272,12 +272,16 @@ def extract_topics(in_files, in_file, min_docs_slider, in_colnames, max_topics_s
|
|
272 |
zip_folder(topic_model_save_name_folder, topic_model_save_name_zip)
|
273 |
output_list.append(topic_model_save_name_zip)
|
274 |
|
|
|
275 |
if return_intermediate_files == "Yes":
|
276 |
print("Saving embeddings to file")
|
277 |
if low_resource_mode == "Yes":
|
278 |
embeddings_file_name = data_file_name_no_ext + '_' + 'tfidf_embeddings.npz'
|
279 |
else:
|
280 |
-
|
|
|
|
|
|
|
281 |
|
282 |
np.savez_compressed(embeddings_file_name, embeddings_out)
|
283 |
|
@@ -297,15 +301,13 @@ def extract_topics(in_files, in_file, min_docs_slider, in_colnames, max_topics_s
|
|
297 |
time_out = f"All processes took {all_toc - all_tic:0.1f} seconds"
|
298 |
print(time_out)
|
299 |
|
300 |
-
return output_text, output_list, topics_vis
|
301 |
|
302 |
all_toc = time.perf_counter()
|
303 |
time_out = f"All processes took {all_toc - all_tic:0.1f} seconds"
|
304 |
print(time_out)
|
305 |
|
306 |
-
return output_text, output_list, None
|
307 |
-
|
308 |
-
# , topic_model_save_name
|
309 |
|
310 |
# ## Gradio app - extract topics
|
311 |
|
@@ -314,6 +316,7 @@ block = gr.Blocks(theme = gr.themes.Base())
|
|
314 |
with block:
|
315 |
|
316 |
data_state = gr.State(pd.DataFrame())
|
|
|
317 |
|
318 |
gr.Markdown(
|
319 |
"""
|
@@ -359,10 +362,10 @@ with block:
|
|
359 |
visualise_topics = gr.Dropdown(label = "Create a visualisation to map topics.", value="No", choices=["Yes", "No"])
|
360 |
|
361 |
# Update column names dropdown when file uploaded
|
362 |
-
in_files.upload(fn=put_columns_in_df, inputs=[in_files], outputs=[in_colnames, in_label, data_state])
|
363 |
in_colnames.change(dummy_function, in_colnames, None)
|
364 |
|
365 |
-
topics_btn.click(fn=extract_topics, inputs=[data_state, in_files, min_docs_slider, in_colnames, max_topics_slider, candidate_topics, in_label, anonymise_drop, return_intermediate_files, embedding_super_compress, low_resource_mode_opt, create_llm_topic_labels, save_topic_model, visualise_topics, reduce_outliers], outputs=[output_single_text, output_file, plot], api_name="topics")
|
366 |
|
367 |
block.queue().launch(debug=True)#, server_name="0.0.0.0", ssl_verify=False, server_port=7860)
|
368 |
|
|
|
80 |
hf_model_file = 'phi-2-orange.Q5_K_M.gguf' #'Capybara-7B-V1.9-Q5_K_M.gguf' # 'stablelm-2-zephyr-1_6b-Q5_K_M.gguf'
|
81 |
|
82 |
|
83 |
+
def extract_topics(in_files, in_file, min_docs_slider, in_colnames, max_topics_slider, candidate_topics, in_label, anonymise_drop, return_intermediate_files, embeddings_super_compress, low_resource_mode, create_llm_topic_labels, save_topic_model, visualise_topics, reduce_outliers, embeddings_out):
|
84 |
+
|
85 |
all_tic = time.perf_counter()
|
86 |
|
87 |
output_list = []
|
|
|
144 |
|
145 |
umap_model = TruncatedSVD(n_components=5, random_state=random_seed)
|
146 |
|
147 |
+
embeddings_out, reduced_embeddings = make_or_load_embeddings(docs, file_list, data_file_name_no_ext, embeddings_out, embedding_model, return_intermediate_files, embeddings_super_compress, low_resource_mode, create_llm_topic_labels)
|
148 |
|
149 |
vectoriser_model = CountVectorizer(stop_words="english", ngram_range=(1, 2), min_df=0.1)
|
150 |
|
|
|
272 |
zip_folder(topic_model_save_name_folder, topic_model_save_name_zip)
|
273 |
output_list.append(topic_model_save_name_zip)
|
274 |
|
275 |
+
# If you want to save your embedding files
|
276 |
if return_intermediate_files == "Yes":
|
277 |
print("Saving embeddings to file")
|
278 |
if low_resource_mode == "Yes":
|
279 |
embeddings_file_name = data_file_name_no_ext + '_' + 'tfidf_embeddings.npz'
|
280 |
else:
|
281 |
+
if embeddings_super_compress == "No":
|
282 |
+
embeddings_file_name = data_file_name_no_ext + '_' + 'ai_embeddings.npz'
|
283 |
+
else:
|
284 |
+
embeddings_file_name = data_file_name_no_ext + '_' + 'ai_embedding_compress.npz'
|
285 |
|
286 |
np.savez_compressed(embeddings_file_name, embeddings_out)
|
287 |
|
|
|
301 |
time_out = f"All processes took {all_toc - all_tic:0.1f} seconds"
|
302 |
print(time_out)
|
303 |
|
304 |
+
return output_text, output_list, topics_vis, embeddings_out
|
305 |
|
306 |
all_toc = time.perf_counter()
|
307 |
time_out = f"All processes took {all_toc - all_tic:0.1f} seconds"
|
308 |
print(time_out)
|
309 |
|
310 |
+
return output_text, output_list, None, embeddings_out
|
|
|
|
|
311 |
|
312 |
# ## Gradio app - extract topics
|
313 |
|
|
|
316 |
with block:
|
317 |
|
318 |
data_state = gr.State(pd.DataFrame())
|
319 |
+
embeddings_state = gr.State(np.array([]))
|
320 |
|
321 |
gr.Markdown(
|
322 |
"""
|
|
|
362 |
visualise_topics = gr.Dropdown(label = "Create a visualisation to map topics.", value="No", choices=["Yes", "No"])
|
363 |
|
364 |
# Update column names dropdown when file uploaded
|
365 |
+
in_files.upload(fn=put_columns_in_df, inputs=[in_files], outputs=[in_colnames, in_label, data_state, embeddings_state])
|
366 |
in_colnames.change(dummy_function, in_colnames, None)
|
367 |
|
368 |
+
topics_btn.click(fn=extract_topics, inputs=[data_state, in_files, min_docs_slider, in_colnames, max_topics_slider, candidate_topics, in_label, anonymise_drop, return_intermediate_files, embedding_super_compress, low_resource_mode_opt, create_llm_topic_labels, save_topic_model, visualise_topics, reduce_outliers, embeddings_state], outputs=[output_single_text, output_file, plot, embeddings_state], api_name="topics")
|
369 |
|
370 |
block.queue().launch(debug=True)#, server_name="0.0.0.0", ssl_verify=False, server_port=7860)
|
371 |
|
funcs/embeddings.py
CHANGED
@@ -13,60 +13,60 @@ if cuda.is_available():
|
|
13 |
else:
|
14 |
torch_device = "cpu"
|
15 |
|
16 |
-
def make_or_load_embeddings(docs, file_list, data_file_name_no_ext, embedding_model, return_intermediate_files, embeddings_super_compress, low_resource_mode_opt, reduce_embeddings="Yes"):
|
17 |
|
18 |
-
|
|
|
|
|
19 |
|
20 |
-
|
21 |
-
print("Loading embeddings from file.")
|
22 |
-
embeddings_out = np.load(embeddings_file_names[0])['arr_0']
|
23 |
|
24 |
-
|
25 |
-
|
26 |
-
embeddings_out
|
27 |
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
print("Creating simplified 'sparse' embeddings based on TfIDF")
|
36 |
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
|
42 |
-
|
43 |
-
|
|
|
|
|
44 |
|
45 |
-
|
46 |
-
|
47 |
|
48 |
-
|
|
|
49 |
|
50 |
-
|
51 |
-
print("Creating dense embeddings based on transformers model")
|
52 |
|
53 |
-
|
|
|
54 |
|
55 |
-
|
56 |
-
time_out = f"The embedding took {toc - tic:0.1f} seconds"
|
57 |
-
print(time_out)
|
58 |
|
59 |
-
|
60 |
-
|
61 |
-
print(
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
|
71 |
# Pre-reduce embeddings for visualisation purposes
|
72 |
if reduce_embeddings == "Yes":
|
|
|
13 |
else:
|
14 |
torch_device = "cpu"
|
15 |
|
16 |
+
def make_or_load_embeddings(docs, file_list, data_file_name_no_ext, embeddings_out, embedding_model, return_intermediate_files, embeddings_super_compress, low_resource_mode_opt, reduce_embeddings="Yes"):
|
17 |
|
18 |
+
# If no embeddings found, make or load in
|
19 |
+
if embeddings_out.size == 0:
|
20 |
+
print("Embeddings not found. Loading or generating new ones.")
|
21 |
|
22 |
+
embeddings_file_names = [string.lower() for string in file_list if "embedding" in string.lower()]
|
|
|
|
|
23 |
|
24 |
+
if embeddings_file_names:
|
25 |
+
print("Loading embeddings from file.")
|
26 |
+
embeddings_out = np.load(embeddings_file_names[0])['arr_0']
|
27 |
|
28 |
+
# If embedding files have 'super_compress' in the title, they have been multiplied by 100 before save
|
29 |
+
if "compress" in embeddings_file_names[0]:
|
30 |
+
embeddings_out /= 100
|
31 |
|
32 |
+
if not embeddings_file_names:
|
33 |
+
tic = time.perf_counter()
|
34 |
+
print("Starting to embed documents.")
|
|
|
35 |
|
36 |
+
# Custom model
|
37 |
+
# If on CPU, don't resort to embedding models
|
38 |
+
if low_resource_mode_opt == "Yes":
|
39 |
+
print("Creating simplified 'sparse' embeddings based on TfIDF")
|
40 |
|
41 |
+
embedding_model = make_pipeline(
|
42 |
+
TfidfVectorizer(),
|
43 |
+
TruncatedSVD(100, random_state=random_seed)
|
44 |
+
)
|
45 |
|
46 |
+
# Fit the pipeline to the text data
|
47 |
+
embedding_model.fit(docs)
|
48 |
|
49 |
+
# Transform text data to embeddings
|
50 |
+
embeddings_out = embedding_model.transform(docs)
|
51 |
|
52 |
+
#embeddings_out = embedding_model.encode(sentences=docs, show_progress_bar = True, batch_size = 32)
|
|
|
53 |
|
54 |
+
elif low_resource_mode_opt == "No":
|
55 |
+
print("Creating dense embeddings based on transformers model")
|
56 |
|
57 |
+
embeddings_out = embedding_model.encode(sentences=docs, max_length=1024, show_progress_bar = True, batch_size = 32) # For Jina # #
|
|
|
|
|
58 |
|
59 |
+
toc = time.perf_counter()
|
60 |
+
time_out = f"The embedding took {toc - tic:0.1f} seconds"
|
61 |
+
print(time_out)
|
62 |
+
|
63 |
+
# If the user has chosen to go with super compressed embedding files to save disk space
|
64 |
+
if embeddings_super_compress == "Yes":
|
65 |
+
embeddings_out = np.round(embeddings_out, 3)
|
66 |
+
embeddings_out *= 100
|
67 |
+
|
68 |
+
else:
|
69 |
+
print("Found pre-loaded embeddings.")
|
70 |
|
71 |
# Pre-reduce embeddings for visualisation purposes
|
72 |
if reduce_embeddings == "Yes":
|
funcs/helper_functions.py
CHANGED
@@ -5,6 +5,7 @@ import pandas as pd
|
|
5 |
import gradio as gr
|
6 |
import gzip
|
7 |
import pickle
|
|
|
8 |
|
9 |
|
10 |
def detect_file_type(filename):
|
@@ -62,8 +63,9 @@ def put_columns_in_df(in_file, in_bm25_column):
|
|
62 |
|
63 |
|
64 |
concat_choices.extend(new_choices)
|
65 |
-
|
66 |
-
|
|
|
67 |
|
68 |
def get_file_path_end(file_path):
|
69 |
# First, get the basename of the file (e.g., "example.txt" from "/path/to/example.txt")
|
|
|
5 |
import gradio as gr
|
6 |
import gzip
|
7 |
import pickle
|
8 |
+
import numpy as np
|
9 |
|
10 |
|
11 |
def detect_file_type(filename):
|
|
|
63 |
|
64 |
|
65 |
concat_choices.extend(new_choices)
|
66 |
+
|
67 |
+
#The np.array([]) at the end is for clearing the embedding state when a new file is loaded
|
68 |
+
return gr.Dropdown(choices=concat_choices), gr.Dropdown(choices=concat_choices), df, np.array([])
|
69 |
|
70 |
def get_file_path_end(file_path):
|
71 |
# First, get the basename of the file (e.g., "example.txt" from "/path/to/example.txt")
|