Sonnyjim committed
Commit 9eeba1e
Parent: 32cf9fb

App now checks whether embeddings are already loaded before topic modelling, and saves the embeddings only once.

Files changed (3):
  1. app.py +13 -10
  2. funcs/embeddings.py +41 -41
  3. funcs/helper_functions.py +4 -2
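
In outline, the commit gates the expensive embedding step on whether the session already holds embeddings. A minimal sketch of the pattern (names mirror the diffs below; compute_embeddings is a hypothetical stand-in for the real embedding step in funcs/embeddings.py):

import numpy as np

def compute_embeddings(docs):
    # Hypothetical placeholder for the real embedding step.
    return np.random.rand(len(docs), 5)

def make_or_load_embeddings_sketch(docs, embeddings_out):
    # Core of the commit: only embed when the cached state is empty.
    if embeddings_out.size == 0:
        embeddings_out = compute_embeddings(docs)
    else:
        print("Found pre-loaded embeddings.")
    return embeddings_out

docs = ["first document", "second document"]
state = np.array([])                                  # empty state on first run
state = make_or_load_embeddings_sketch(docs, state)   # computes once
state = make_or_load_embeddings_sketch(docs, state)   # reuses the cache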
app.py CHANGED
@@ -80,8 +80,8 @@ hf_model_name = 'TheBloke/phi-2-orange-GGUF' #'NousResearch/Nous-Capybara-7B-V1
 hf_model_file = 'phi-2-orange.Q5_K_M.gguf' #'Capybara-7B-V1.9-Q5_K_M.gguf' # 'stablelm-2-zephyr-1_6b-Q5_K_M.gguf'
 
 
-def extract_topics(in_files, in_file, min_docs_slider, in_colnames, max_topics_slider, candidate_topics, in_label, anonymise_drop, return_intermediate_files, embeddings_super_compress, low_resource_mode, create_llm_topic_labels, save_topic_model, visualise_topics, reduce_outliers):
+def extract_topics(in_files, in_file, min_docs_slider, in_colnames, max_topics_slider, candidate_topics, in_label, anonymise_drop, return_intermediate_files, embeddings_super_compress, low_resource_mode, create_llm_topic_labels, save_topic_model, visualise_topics, reduce_outliers, embeddings_out):
 
     all_tic = time.perf_counter()
 
     output_list = []
@@ -144,7 +144,7 @@ def extract_topics(in_files, in_file, min_docs_slider, in_colnames, max_topics_s
 
     umap_model = TruncatedSVD(n_components=5, random_state=random_seed)
 
-    embeddings_out, reduced_embeddings = make_or_load_embeddings(docs, file_list, data_file_name_no_ext, embedding_model, return_intermediate_files, embeddings_super_compress, low_resource_mode, create_llm_topic_labels)
+    embeddings_out, reduced_embeddings = make_or_load_embeddings(docs, file_list, data_file_name_no_ext, embeddings_out, embedding_model, return_intermediate_files, embeddings_super_compress, low_resource_mode, create_llm_topic_labels)
 
     vectoriser_model = CountVectorizer(stop_words="english", ngram_range=(1, 2), min_df=0.1)
 
@@ -272,12 +272,16 @@ def extract_topics(in_files, in_file, min_docs_slider, in_colnames, max_topics_s
     zip_folder(topic_model_save_name_folder, topic_model_save_name_zip)
     output_list.append(topic_model_save_name_zip)
 
+    # If you want to save your embedding files
     if return_intermediate_files == "Yes":
         print("Saving embeddings to file")
         if low_resource_mode == "Yes":
             embeddings_file_name = data_file_name_no_ext + '_' + 'tfidf_embeddings.npz'
         else:
-            embeddings_file_name = data_file_name_no_ext + '_' + 'ai_embeddings.npz'
+            if embeddings_super_compress == "No":
+                embeddings_file_name = data_file_name_no_ext + '_' + 'ai_embeddings.npz'
+            else:
+                embeddings_file_name = data_file_name_no_ext + '_' + 'ai_embedding_compress.npz'
 
         np.savez_compressed(embeddings_file_name, embeddings_out)
 
@@ -297,15 +301,13 @@ def extract_topics(in_files, in_file, min_docs_slider, in_colnames, max_topics_s
     time_out = f"All processes took {all_toc - all_tic:0.1f} seconds"
     print(time_out)
 
-    return output_text, output_list, topics_vis
+    return output_text, output_list, topics_vis, embeddings_out
 
     all_toc = time.perf_counter()
     time_out = f"All processes took {all_toc - all_tic:0.1f} seconds"
    print(time_out)
 
-    return output_text, output_list, None
-
-    # , topic_model_save_name
+    return output_text, output_list, None, embeddings_out
 
 # ## Gradio app - extract topics
 
@@ -314,6 +316,7 @@ block = gr.Blocks(theme = gr.themes.Base())
 with block:
 
     data_state = gr.State(pd.DataFrame())
+    embeddings_state = gr.State(np.array([]))
 
     gr.Markdown(
     """
@@ -359,10 +362,10 @@ with block:
     visualise_topics = gr.Dropdown(label = "Create a visualisation to map topics.", value="No", choices=["Yes", "No"])
 
     # Update column names dropdown when file uploaded
-    in_files.upload(fn=put_columns_in_df, inputs=[in_files], outputs=[in_colnames, in_label, data_state])
+    in_files.upload(fn=put_columns_in_df, inputs=[in_files], outputs=[in_colnames, in_label, data_state, embeddings_state])
     in_colnames.change(dummy_function, in_colnames, None)
 
-    topics_btn.click(fn=extract_topics, inputs=[data_state, in_files, min_docs_slider, in_colnames, max_topics_slider, candidate_topics, in_label, anonymise_drop, return_intermediate_files, embedding_super_compress, low_resource_mode_opt, create_llm_topic_labels, save_topic_model, visualise_topics, reduce_outliers], outputs=[output_single_text, output_file, plot], api_name="topics")
+    topics_btn.click(fn=extract_topics, inputs=[data_state, in_files, min_docs_slider, in_colnames, max_topics_slider, candidate_topics, in_label, anonymise_drop, return_intermediate_files, embedding_super_compress, low_resource_mode_opt, create_llm_topic_labels, save_topic_model, visualise_topics, reduce_outliers, embeddings_state], outputs=[output_single_text, output_file, plot, embeddings_state], api_name="topics")
 
 block.queue().launch(debug=True)#, server_name="0.0.0.0", ssl_verify=False, server_port=7860)
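
The wiring above uses Gradio's state round trip: embeddings_state appears in both the inputs and outputs of topics_btn.click, so whatever extract_topics returns in that slot is cached and fed back in on the next click. A self-contained sketch of the same pattern, with placeholder components rather than the app's own:

import gradio as gr
import numpy as np

def process(text, embeddings):
    # Stand-in for extract_topics: compute embeddings only when state is empty.
    if embeddings.size == 0:
        embeddings = np.random.rand(3, 5)   # placeholder embedding step
        status = "Computed new embeddings"
    else:
        status = "Reused cached embeddings"
    return status, embeddings               # second value is written back to state

with gr.Blocks() as demo:
    embeddings_state = gr.State(np.array([]))
    text_in = gr.Textbox(label="Input")
    status_out = gr.Textbox(label="Status")
    run_btn = gr.Button("Run")
    run_btn.click(process, inputs=[text_in, embeddings_state],
                  outputs=[status_out, embeddings_state])

# demo.launch()  # uncomment to try it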
funcs/embeddings.py CHANGED
@@ -13,60 +13,60 @@ if cuda.is_available():
 else:
     torch_device = "cpu"
 
-def make_or_load_embeddings(docs, file_list, data_file_name_no_ext, embedding_model, return_intermediate_files, embeddings_super_compress, low_resource_mode_opt, reduce_embeddings="Yes"):
-
-    embeddings_file_names = [string.lower() for string in file_list if "embedding" in string.lower()]
-
-    if embeddings_file_names:
-        print("Loading embeddings from file.")
-        embeddings_out = np.load(embeddings_file_names[0])['arr_0']
-
-        # If embedding files have 'super_compress' in the title, they have been multiplied by 100 before save
-        if "compress" in embeddings_file_names[0]:
-            embeddings_out /= 100
-
-    if not embeddings_file_names:
-        tic = time.perf_counter()
-        print("Starting to embed documents.")
-
-        # Custom model
-        # If on CPU, don't resort to embedding models
-        if low_resource_mode_opt == "Yes":
-            print("Creating simplified 'sparse' embeddings based on TfIDF")
-
-            embedding_model = make_pipeline(
-                TfidfVectorizer(),
-                TruncatedSVD(100, random_state=random_seed)
-            )
-
-            # Fit the pipeline to the text data
-            embedding_model.fit(docs)
-
-            # Transform text data to embeddings
-            embeddings_out = embedding_model.transform(docs)
-
-            #embeddings_out = embedding_model.encode(sentences=docs, show_progress_bar = True, batch_size = 32)
-
-        elif low_resource_mode_opt == "No":
-            print("Creating dense embeddings based on transformers model")
-
-            embeddings_out = embedding_model.encode(sentences=docs, max_length=1024, show_progress_bar = True, batch_size = 32) # For Jina # #
-
-        toc = time.perf_counter()
-        time_out = f"The embedding took {toc - tic:0.1f} seconds"
-        print(time_out)
-
-        # If you want to save your files for next time
-        if return_intermediate_files == "Yes":
-            print("Saving embeddings to file")
-            if embeddings_super_compress == "No":
-                semantic_search_file_name = data_file_name_no_ext + '_' + 'embeddings.npz'
-                np.savez_compressed(semantic_search_file_name, embeddings_out)
-            else:
-                semantic_search_file_name = data_file_name_no_ext + '_' + 'embedding_compress.npz'
-                embeddings_out_round = np.round(embeddings_out, 3)
-                embeddings_out_round *= 100 # Rounding not currently used
-                np.savez_compressed(semantic_search_file_name, embeddings_out_round)
+def make_or_load_embeddings(docs, file_list, data_file_name_no_ext, embeddings_out, embedding_model, return_intermediate_files, embeddings_super_compress, low_resource_mode_opt, reduce_embeddings="Yes"):
+
+    # If no embeddings found, make or load in
+    if embeddings_out.size == 0:
+        print("Embeddings not found. Loading or generating new ones.")
+
+        embeddings_file_names = [string.lower() for string in file_list if "embedding" in string.lower()]
+
+        if embeddings_file_names:
+            print("Loading embeddings from file.")
+            embeddings_out = np.load(embeddings_file_names[0])['arr_0']
+
+            # If embedding files have 'super_compress' in the title, they have been multiplied by 100 before save
+            if "compress" in embeddings_file_names[0]:
+                embeddings_out /= 100
+
+        if not embeddings_file_names:
+            tic = time.perf_counter()
+            print("Starting to embed documents.")
+
+            # Custom model
+            # If on CPU, don't resort to embedding models
+            if low_resource_mode_opt == "Yes":
+                print("Creating simplified 'sparse' embeddings based on TfIDF")
+
+                embedding_model = make_pipeline(
+                    TfidfVectorizer(),
+                    TruncatedSVD(100, random_state=random_seed)
+                )
+
+                # Fit the pipeline to the text data
+                embedding_model.fit(docs)
+
+                # Transform text data to embeddings
+                embeddings_out = embedding_model.transform(docs)
+
+                #embeddings_out = embedding_model.encode(sentences=docs, show_progress_bar = True, batch_size = 32)
+
+            elif low_resource_mode_opt == "No":
+                print("Creating dense embeddings based on transformers model")
+
+                embeddings_out = embedding_model.encode(sentences=docs, max_length=1024, show_progress_bar = True, batch_size = 32) # For Jina # #
+
+            toc = time.perf_counter()
+            time_out = f"The embedding took {toc - tic:0.1f} seconds"
+            print(time_out)
+
+            # If the user has chosen to go with super compressed embedding files to save disk space
+            if embeddings_super_compress == "Yes":
+                embeddings_out = np.round(embeddings_out, 3)
+                embeddings_out *= 100
+
+    else:
+        print("Found pre-loaded embeddings.")
 
     # Pre-reduce embeddings for visualisation purposes
     if reduce_embeddings == "Yes":
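
The 'super compress' option trades precision for disk space: embeddings are rounded to three decimal places and multiplied by 100 before saving, and the loader divides by 100 whenever 'compress' appears in the file name. A small sketch of that round trip (the file name here is illustrative):

import numpy as np

embeddings = np.random.rand(4, 8).astype(np.float32)

# Save side: round to 3 d.p. and scale by 100, as in the diff above.
compressed = np.round(embeddings, 3) * 100
np.savez_compressed("example_embedding_compress.npz", compressed)

# Load side: np.savez_compressed stores an unnamed array under 'arr_0';
# dividing by 100 undoes the scaling, leaving the rounded values.
restored = np.load("example_embedding_compress.npz")["arr_0"] / 100
assert np.allclose(restored, np.round(embeddings, 3))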
funcs/helper_functions.py CHANGED
@@ -5,6 +5,7 @@ import pandas as pd
 import gradio as gr
 import gzip
 import pickle
+import numpy as np
 
 
 def detect_file_type(filename):
@@ -62,8 +63,9 @@ def put_columns_in_df(in_file, in_bm25_column):
 
 
     concat_choices.extend(new_choices)
 
-    return gr.Dropdown(choices=concat_choices), gr.Dropdown(choices=concat_choices), df
+    #The np.array([]) at the end is for clearing the embedding state when a new file is loaded
+    return gr.Dropdown(choices=concat_choices), gr.Dropdown(choices=concat_choices), df, np.array([])
 
 def get_file_path_end(file_path):
     # First, get the basename of the file (e.g., "example.txt" from "/path/to/example.txt")
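
Returning the extra np.array([]) works because Gradio assigns a handler's return values to its outputs list positionally, so the empty array lands in embeddings_state and invalidates the cached embeddings whenever a new file is uploaded. A stripped-down sketch with placeholder column names and components:

import gradio as gr
import numpy as np
import pandas as pd

def on_upload(files):
    # Placeholder for put_columns_in_df: the column names are illustrative.
    choices = ["col_a", "col_b"]
    return (gr.Dropdown(choices=choices),
            gr.Dropdown(choices=choices),
            pd.DataFrame(),
            np.array([]))        # clears embeddings_state on each new upload

with gr.Blocks() as demo:
    data_state = gr.State(pd.DataFrame())
    embeddings_state = gr.State(np.array([]))
    in_files = gr.File(file_count="multiple")
    in_colnames = gr.Dropdown(choices=[], label="Columns")
    in_label = gr.Dropdown(choices=[], label="Label column")
    in_files.upload(on_upload, inputs=[in_files],
                    outputs=[in_colnames, in_label, data_state, embeddings_state])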