Sean-Case committed
Commit aa3df37
1 Parent(s): 0b7839c

Greatly improved low resource mode speed (at cost of potential quality)

Files changed (3)
  1. app.py +30 -21
  2. funcs/embeddings.py +7 -3
  3. funcs/representation_model.py +6 -3
app.py CHANGED
@@ -2,7 +2,8 @@ import gradio as gr
 from datetime import datetime
 import pandas as pd
 import numpy as np
-from sklearn.cluster import KMeans
+import time
+#from sklearn.cluster import KMeans
 from sklearn.feature_extraction.text import CountVectorizer
 from transformers import AutoModel, AutoTokenizer
 from transformers.pipelines import pipeline
@@ -81,6 +82,8 @@ hf_model_file = 'phi-2-orange.Q5_K_M.gguf' #'Capybara-7B-V1.9-Q5_K_M.gguf' # '
 
 def extract_topics(in_files, in_file, min_docs_slider, in_colnames, max_topics_slider, candidate_topics, in_label, anonymise_drop, return_intermediate_files, embeddings_super_compress, low_resource_mode, create_llm_topic_labels, save_topic_model, visualise_topics):
 
+    all_tic = time.perf_counter()
+
     output_list = []
     file_list = [string.name for string in in_file]
 
@@ -122,18 +125,22 @@ def extract_topics(in_files, in_file, min_docs_slider, in_colnames, max_topics_s
 
         embedding_model_pipe = pipeline("feature-extraction", model=embedding_model, tokenizer=tokenizer)
 
+        umap_model = UMAP(n_neighbors=15, n_components=5, random_state=random_seed)
+
     elif low_resource_mode == "Yes":
-        print("Choosing low resource TfIDF model")
+        print("Choosing low resource TF-IDF model")
         embedding_model_pipe = make_pipeline(
             TfidfVectorizer(),
             TruncatedSVD(100) # 100 # To be compatible with zero shot, this needs to be lower than number of suggested topics
         )
         embedding_model = embedding_model_pipe
 
-    embeddings_out, reduced_embeddings = make_or_load_embeddings(docs, file_list, data_file_name_no_ext, embedding_model, return_intermediate_files, embeddings_super_compress, low_resource_mode, create_llm_topic_labels)
+        umap_model = TruncatedSVD(n_components=3, random_state=random_seed)
 
-
+    embeddings_out, reduced_embeddings = make_or_load_embeddings(docs, file_list, data_file_name_no_ext, embedding_model, return_intermediate_files, embeddings_super_compress, low_resource_mode, create_llm_topic_labels)
+
 
     vectoriser_model = CountVectorizer(stop_words="english", ngram_range=(1, 2), min_df=0.1)
 
@@ -141,19 +148,14 @@ def extract_topics(in_files, in_file, min_docs_slider, in_colnames, max_topics_s
     from funcs.representation_model import create_representation_model, llm_config, chosen_start_tag
 
     print("Create LLM topic labels:", create_llm_topic_labels)
-    representation_model = create_representation_model(create_llm_topic_labels, llm_config, hf_model_name, hf_model_file, chosen_start_tag)
-
-
-
-
+    representation_model = create_representation_model(create_llm_topic_labels, llm_config, hf_model_name, hf_model_file, chosen_start_tag, low_resource_mode)
 
 
     if not candidate_topics:
-        #umap_model = UMAP(n_neighbors=15, n_components=5, random_state=random_seed)
-
+
         topic_model = BERTopic( embedding_model=embedding_model_pipe,
                                 vectorizer_model=vectoriser_model,
-                                #umap_model=umap_model,
+                                umap_model=umap_model,
                                 min_topic_size= min_docs_slider,
                                 nr_topics = max_topics_slider,
                                 representation_model=representation_model,
@@ -173,15 +175,9 @@ def extract_topics(in_files, in_file, min_docs_slider, in_colnames, max_topics_s
         zero_shot_topics = read_file(candidate_topics.name)
         zero_shot_topics_lower = list(zero_shot_topics.iloc[:, 0].str.lower())
 
-        if len(zero_shot_topics_lower) < 15:
-            umap_neighbours = len(zero_shot_topics_lower)
-        else: umap_neighbours = 15
-
-        #umap_model = UMAP(n_neighbors=umap_neighbours, n_components=5, random_state=random_seed)
-
         topic_model = BERTopic( embedding_model=embedding_model_pipe,
                                 vectorizer_model=vectoriser_model,
-                                #umap_model=umap_model,
+                                umap_model=umap_model,
                                 min_topic_size = min_docs_slider,
                                 nr_topics = max_topics_slider,
                                 zeroshot_topic_list = zero_shot_topics_lower,
@@ -252,11 +248,24 @@ def extract_topics(in_files, in_file, min_docs_slider, in_colnames, max_topics_s
 
     if visualise_topics == "Yes":
         # Visualise the topics:
+        vis_tic = time.perf_counter()
         print("Creating visualisation")
         topics_vis = topic_model.visualize_documents(label_col, reduced_embeddings=reduced_embeddings, hide_annotations=True, hide_document_hover=False, custom_labels=True)
 
+        all_toc = time.perf_counter()
+        time_out = f"Creating visualisation took {all_toc - vis_tic:0.1f} seconds"
+        print(time_out)
+
+        time_out = f"All processes took {all_toc - all_tic:0.1f} seconds"
+        print(time_out)
+
         return output_text, output_list, topics_vis
 
+    all_toc = time.perf_counter()
+    time_out = f"All processes took {all_toc - all_tic:0.1f} seconds"
+    print(time_out)
+
     return output_text, output_list, None
 
     # , topic_model_save_name
@@ -286,7 +295,7 @@ with block:
         candidate_topics = gr.File(label="Input topics from file (csv). File should have at least one column with a header and topic keywords in cells below. Topics will be taken from the first column of the file. Currently not compatible with low-resource embeddings.")
 
         with gr.Row():
-            min_docs_slider = gr.Slider(minimum = 2, maximum = 1000, value = 15, step = 1, label = "Minimum number of documents needed to create topic")
+            min_docs_slider = gr.Slider(minimum = 2, maximum = 1000, value = 15, step = 1, label = "Minimum number of documents per topic (use ~3 for low resource mode).")
             max_topics_slider = gr.Slider(minimum = 2, maximum = 500, value = 3, step = 1, label = "Maximum number of topics")
 
         with gr.Row():
@@ -305,7 +314,7 @@ with block:
             return_intermediate_files = gr.Dropdown(label = "Return intermediate processing files from file preparation. Files can be loaded in to save processing time in future.", value="Yes", choices=["Yes", "No"])
            embedding_super_compress = gr.Dropdown(label = "Round embeddings to three dp for smaller files with less accuracy.", value="No", choices=["Yes", "No"])
         with gr.Row():
-            low_resource_mode_opt = gr.Dropdown(label = "Use low resource embeddings model.", value="No", choices=["Yes", "No"])
+            low_resource_mode_opt = gr.Dropdown(label = "Use low resource embeddings and processing.", value="No", choices=["Yes", "No"])
             create_llm_topic_labels = gr.Dropdown(label = "Create LLM-generated topic labels.", value="No", choices=["Yes", "No"])
            save_topic_model = gr.Dropdown(label = "Save topic model to file.", value="Yes", choices=["Yes", "No"])
            visualise_topics = gr.Dropdown(label = "Create a visualisation to map topics.", value="Yes", choices=["Yes", "No"])
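For context, a minimal self-contained sketch of the pattern the diff above switches to in low-resource mode: TF-IDF embeddings reduced with TruncatedSVD stand in for transformer sentence embeddings, and a small TruncatedSVD also replaces UMAP as BERTopic's dimensionality-reduction step. The SVD component sizes mirror the diff; the 20 newsgroups sample corpus and the seed value are assumptions for the demo, not values from the repo.

```python
# Sketch of the low-resource path wired up in app.py above, assuming a
# demo corpus; BERTopic accepts a scikit-learn pipeline as its embedding
# backend and any fit/transform object in place of UMAP.
from sklearn.datasets import fetch_20newsgroups
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from bertopic import BERTopic

docs = fetch_20newsgroups(subset="train",
                          remove=("headers", "footers", "quotes")).data[:1000]

# TF-IDF + 100-component SVD replaces the transformer embedding model.
embedding_model_pipe = make_pipeline(TfidfVectorizer(),
                                     TruncatedSVD(100, random_state=42))

# A 3-component SVD replaces UMAP for BERTopic's internal reduction step:
# much faster, but purely linear, hence the commit's quality caveat.
reduction_model = TruncatedSVD(n_components=3, random_state=42)

topic_model = BERTopic(embedding_model=embedding_model_pipe,
                       umap_model=reduction_model,
                       min_topic_size=3)  # low value, per the new slider label
topics, probs = topic_model.fit_transform(docs)
print(topic_model.get_topic_info().head())
```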
funcs/embeddings.py CHANGED
@@ -35,7 +35,7 @@ def make_or_load_embeddings(docs, file_list, data_file_name_no_ext, embedding_mo
     print("Creating simplified 'sparse' embeddings based on TfIDF")
     embedding_model = make_pipeline(
         TfidfVectorizer(),
-        TruncatedSVD(100)
+        TruncatedSVD(100, random_state=random_seed)
     )
 
     # Fit the pipeline to the text data
@@ -69,7 +69,11 @@ def make_or_load_embeddings(docs, file_list, data_file_name_no_ext, embedding_mo
 
     # Pre-reduce embeddings for visualisation purposes
     if reduce_embeddings == "Yes":
-        reduced_embeddings = UMAP(n_neighbors=15, n_components=2, min_dist=0.0, metric='cosine', random_state=random_seed).fit_transform(embeddings_out)
-        return embeddings_out, reduced_embeddings
+        if low_resource_mode_opt == "No":
+            reduced_embeddings = UMAP(n_neighbors=15, n_components=2, min_dist=0.0, metric='cosine', random_state=random_seed).fit_transform(embeddings_out)
+            return embeddings_out, reduced_embeddings
+        else:
+            reduced_embeddings = TruncatedSVD(2, random_state=random_seed).fit_transform(embeddings_out)
+            return embeddings_out, reduced_embeddings
 
     return embeddings_out, None
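This change is one of the main speed levers: in low-resource mode, the 2-D pre-reduction used for the document visualisation is done with TruncatedSVD instead of UMAP. A rough, self-contained timing sketch below shows the difference; the random matrix, its shape, and the seed are stand-in assumptions for real document embeddings.

```python
# Rough timing comparison behind the UMAP -> TruncatedSVD swap above.
# The random matrix stands in for real document embeddings.
import time
import numpy as np
from sklearn.decomposition import TruncatedSVD
from umap import UMAP

random_seed = 42
embeddings_out = np.random.rand(5000, 100).astype(np.float32)

tic = time.perf_counter()
svd_2d = TruncatedSVD(2, random_state=random_seed).fit_transform(embeddings_out)
print(f"TruncatedSVD reduction took {time.perf_counter() - tic:0.2f} seconds")

tic = time.perf_counter()
umap_2d = UMAP(n_neighbors=15, n_components=2, min_dist=0.0, metric="cosine",
               random_state=random_seed).fit_transform(embeddings_out)
print(f"UMAP reduction took {time.perf_counter() - tic:0.2f} seconds")
```

SVD is a one-shot linear projection, while UMAP builds a k-nearest-neighbour graph and runs an iterative layout, so the linear method is typically far faster on CPU but preserves less of the local cluster structure in the plot.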
funcs/representation_model.py CHANGED
@@ -121,7 +121,7 @@ keybert = KeyBERTInspired(random_state=random_seed)
 # MMR
 mmr = MaximalMarginalRelevance(diversity=0.3)
 
-def create_representation_model(create_llm_topic_labels, llm_config, hf_model_name, hf_model_file, chosen_start_tag):
+def create_representation_model(create_llm_topic_labels, llm_config, hf_model_name, hf_model_file, chosen_start_tag, low_resource_mode):
 
     if create_llm_topic_labels == "Yes":
         # Use llama.cpp to load in model
@@ -142,8 +142,11 @@ def create_representation_model(create_llm_topic_labels, llm_config, hf_model_na
     }
 
     elif create_llm_topic_labels == "No":
-        representation_model = {"KeyBERT": keybert}
-        #representation_model = {"mmr": mmr}
+        if low_resource_mode == "Yes":
+            #representation_model = {"mmr": mmr}
+            representation_model = {"KeyBERT": keybert}
+        else:
+            representation_model = {"KeyBERT": keybert}
 
     # Deprecated example using CTransformers. This package is not really used anymore
     #model = AutoModelForCausalLM.from_pretrained('NousResearch/Nous-Capybara-7B-V1.9-GGUF', model_type='mistral', model_file='Capybara-7B-V1.9-Q5_K_M.gguf', hf=True, **vars(llm_config))
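Note that, as committed, both branches of the new low_resource_mode check resolve to the same KeyBERTInspired representation; the commented-out MMR line marks where a cheaper representation could be slotted in later. A condensed sketch of just this selection logic follows; choose_representation is a hypothetical helper name, and the real function also handles the llama.cpp LLM path when create_llm_topic_labels is "Yes".

```python
# Condensed sketch of the branch added above. As committed, both branches
# return KeyBERTInspired; MMR is left as a possible lighter alternative.
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance

random_seed = 42  # assumed here; the module defines its own seed

keybert = KeyBERTInspired(random_state=random_seed)
mmr = MaximalMarginalRelevance(diversity=0.3)

def choose_representation(create_llm_topic_labels: str, low_resource_mode: str) -> dict:
    """Hypothetical helper mirroring the non-LLM path of create_representation_model."""
    if create_llm_topic_labels == "No":
        if low_resource_mode == "Yes":
            # Could swap in {"mmr": mmr} to avoid KeyBERT's embedding work.
            return {"KeyBERT": keybert}
        return {"KeyBERT": keybert}
    raise NotImplementedError("LLM labelling path omitted in this sketch")
```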