Sonnyjim committed on
Commit
be094ee
1 Parent(s): 0fe5421

Switched embeddings model to BGE Small 1.5, as Jina seemed unable to do zero-shot topic modelling properly

Browse files
Files changed (2) hide show
  1. app.py +25 -17
  2. funcs/embeddings.py +2 -1
app.py CHANGED
@@ -4,7 +4,8 @@ from datetime import datetime
4
  import pandas as pd
5
  import numpy as np
6
  import time
7
- #from sklearn.cluster import KMeans
 
8
  from sklearn.feature_extraction.text import CountVectorizer
9
  from transformers import AutoModel, AutoTokenizer
10
  from transformers.pipelines import pipeline
@@ -80,9 +81,10 @@ def read_logs():
80
 
81
  # Pinning a Jina revision for security purposes: https://www.baseten.co/blog/pinning-ml-model-revisions-for-compatibility-and-security/
82
  # Save Jina model locally as described here: https://huggingface.co/jinaai/jina-embeddings-v2-base-en/discussions/29
83
- embeddings_name = "jinaai/jina-embeddings-v2-small-en"
84
  # local_embeddings_location = "model/jina/"
85
- revision_choice = "b811f03af3d4d7ea72a7c25c802b21fc675a5d99"
 
86
 
87
  # Model used for representing topics
88
  hf_model_name = 'TheBloke/phi-2-orange-GGUF' #'NousResearch/Nous-Capybara-7B-V1.9-GGUF' # 'second-state/stablelm-2-zephyr-1.6b-GGUF'
@@ -182,15 +184,17 @@ def extract_topics(data, in_files, min_docs_slider, in_colnames, max_topics_slid
182
  print("Low resource mode: ", low_resource_mode)
183
 
184
  if low_resource_mode == "No":
185
- print("Using high resource Jina transformer model")
186
- try:
187
- embedding_model = AutoModel.from_pretrained(embeddings_name, revision = revision_choice, trust_remote_code=True,device_map="auto")
188
- except:
189
- embedding_model = AutoModel.from_pretrained(embeddings_name, revision = revision_choice, trust_remote_code=True, device_map="auto", use_auth_token=os.environ["HF_TOKEN"])
190
-
191
- tokenizer = AutoTokenizer.from_pretrained(embeddings_name)
192
 
193
- embedding_model_pipe = pipeline("feature-extraction", model=embedding_model, tokenizer=tokenizer)
 
 
 
 
 
 
194
 
195
  # UMAP model uses Bertopic defaults
196
  umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', low_memory=False, random_state=random_seed)
@@ -216,7 +220,7 @@ def extract_topics(data, in_files, min_docs_slider, in_colnames, max_topics_slid
216
 
217
  if not candidate_topics:
218
 
219
- topic_model = BERTopic( embedding_model=embedding_model_pipe,
220
  vectorizer_model=vectoriser_model,
221
  umap_model=umap_model,
222
  min_topic_size = min_docs_slider,
@@ -237,7 +241,7 @@ def extract_topics(data, in_files, min_docs_slider, in_colnames, max_topics_slid
237
  zero_shot_topics = read_file(candidate_topics.name)
238
  zero_shot_topics_lower = list(zero_shot_topics.iloc[:, 0].str.lower())
239
 
240
- topic_model = BERTopic( embedding_model=embedding_model_pipe,
241
  vectorizer_model=vectoriser_model,
242
  umap_model=umap_model,
243
  min_topic_size = min_docs_slider,
@@ -248,7 +252,11 @@ def extract_topics(data, in_files, min_docs_slider, in_colnames, max_topics_slid
248
 
249
  topics_text, probs = topic_model.fit_transform(docs, embeddings_out)
250
 
251
- if not topics_text:
 
 
 
 
252
  return "No topics found.", data_file_name, None, embeddings_out, data_file_name_no_ext, topic_model, docs, label_list
253
 
254
  else:
@@ -264,9 +272,9 @@ def extract_topics(data, in_files, min_docs_slider, in_colnames, max_topics_slid
264
  embeddings_file_name = data_file_name_no_ext + '_' + 'tfidf_embeddings.npz'
265
  else:
266
  if embeddings_super_compress == "No":
267
- embeddings_file_name = data_file_name_no_ext + '_' + 'jina_embeddings.npz'
268
  else:
269
- embeddings_file_name = data_file_name_no_ext + '_' + 'jina_embeddings_compress.npz'
270
 
271
  np.savez_compressed(embeddings_file_name, embeddings_out)
272
 
@@ -464,7 +472,7 @@ with block:
464
  gr.Markdown(
465
  """
466
  # Topic modeller
467
- Generate topics from open text in tabular data. Upload a file (csv, xlsx, or parquet), then specify the open text column that you want to use to generate topics, and another for labels in the visualisation. If you have an embeddings .npz file of the text made using the 'jina-embeddings-v2-small-en' model, you can load this in at the same time to skip the first modelling step. If you have a pre-defined list of topics, you can upload this as a csv file under 'I have my own list of topics...'. Further configuration options are available under the 'Options' tab.
468
 
469
  Suggested test dataset: https://huggingface.co/datasets/rag-datasets/mini_wikipedia/tree/main/data (passages.parquet)
470
  """)
 
4
  import pandas as pd
5
  import numpy as np
6
  import time
7
+
8
+ from sentence_transformers import SentenceTransformer
9
  from sklearn.feature_extraction.text import CountVectorizer
10
  from transformers import AutoModel, AutoTokenizer
11
  from transformers.pipelines import pipeline
 
81
 
82
  # Pinning a Jina revision for security purposes: https://www.baseten.co/blog/pinning-ml-model-revisions-for-compatibility-and-security/
83
  # Save Jina model locally as described here: https://huggingface.co/jinaai/jina-embeddings-v2-base-en/discussions/29
84
+ embeddings_name = "BAAI/bge-small-en-v1.5" #"jinaai/jina-embeddings-v2-base-en"
85
  # local_embeddings_location = "model/jina/"
86
+ #revision_choice = "b811f03af3d4d7ea72a7c25c802b21fc675a5d99"
87
+ #revision_choice = "69d43700292701b06c24f43b96560566a4e5ad1f"
88
 
89
  # Model used for representing topics
90
  hf_model_name = 'TheBloke/phi-2-orange-GGUF' #'NousResearch/Nous-Capybara-7B-V1.9-GGUF' # 'second-state/stablelm-2-zephyr-1.6b-GGUF'
 
184
  print("Low resource mode: ", low_resource_mode)
185
 
186
  if low_resource_mode == "No":
187
+ print("Using high resource BGE transformer model")
188
+
189
+
 
 
 
 
190
 
191
+ embedding_model = SentenceTransformer(embeddings_name)
192
+ #try:
193
+ #embedding_model = AutoModel.from_pretrained(embeddings_name, revision = revision_choice, trust_remote_code=True,device_map="auto") # For Jina
194
+ #except:
195
+ # embedding_model = AutoModel.from_pretrained(embeddings_name)#, revision = revision_choice, trust_remote_code=True, device_map="auto", use_auth_token=os.environ["HF_TOKEN"])
196
+ #tokenizer = AutoTokenizer.from_pretrained(embeddings_name)
197
+ #embedding_model_pipe = pipeline("feature-extraction", model=embedding_model, tokenizer=tokenizer)
198
 
199
  # UMAP model uses Bertopic defaults
200
  umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', low_memory=False, random_state=random_seed)
 
220
 
221
  if not candidate_topics:
222
 
223
+ topic_model = BERTopic( embedding_model=embedding_model, #embedding_model_pipe, #for Jina
224
  vectorizer_model=vectoriser_model,
225
  umap_model=umap_model,
226
  min_topic_size = min_docs_slider,
 
241
  zero_shot_topics = read_file(candidate_topics.name)
242
  zero_shot_topics_lower = list(zero_shot_topics.iloc[:, 0].str.lower())
243
 
244
+ topic_model = BERTopic( embedding_model=embedding_model, #embedding_model_pipe, # for Jina
245
  vectorizer_model=vectoriser_model,
246
  umap_model=umap_model,
247
  min_topic_size = min_docs_slider,
 
252
 
253
  topics_text, probs = topic_model.fit_transform(docs, embeddings_out)
254
 
255
+ print(topics_text)
256
+
257
+ if topics_text.size == 0:
258
+ # Handle the empty array case
259
+
260
  return "No topics found.", data_file_name, None, embeddings_out, data_file_name_no_ext, topic_model, docs, label_list
261
 
262
  else:
 
272
  embeddings_file_name = data_file_name_no_ext + '_' + 'tfidf_embeddings.npz'
273
  else:
274
  if embeddings_super_compress == "No":
275
+ embeddings_file_name = data_file_name_no_ext + '_' + 'bge_embeddings.npz'
276
  else:
277
+ embeddings_file_name = data_file_name_no_ext + '_' + 'bge_embeddings_compress.npz'
278
 
279
  np.savez_compressed(embeddings_file_name, embeddings_out)
280
 
 
472
  gr.Markdown(
473
  """
474
  # Topic modeller
475
+ Generate topics from open text in tabular data. Upload a file (csv, xlsx, or parquet), then specify the open text column that you want to use to generate topics, and another for labels in the visualisation. If you have an embeddings .npz file of the text made using the 'BAAI/bge-small-en-v1.5' model, you can load this in at the same time to skip the first modelling step. If you have a pre-defined list of topics, you can upload this as a csv file under 'I have my own list of topics...'. Further configuration options are available under the 'Options' tab.
476
 
477
  Suggested test dataset: https://huggingface.co/datasets/rag-datasets/mini_wikipedia/tree/main/data (passages.parquet)
478
  """)
funcs/embeddings.py CHANGED
@@ -54,7 +54,8 @@ def make_or_load_embeddings(docs, file_list, embeddings_out, embedding_model, em
54
  elif low_resource_mode_opt == "No":
55
  print("Creating dense embeddings based on transformers model")
56
 
57
- embeddings_out = embedding_model.encode(sentences=docs, max_length=1024, show_progress_bar = True, batch_size = 32) # For Jina # #
 
58
 
59
  toc = time.perf_counter()
60
  time_out = f"The embedding took {toc - tic:0.1f} seconds"
 
54
  elif low_resource_mode_opt == "No":
55
  print("Creating dense embeddings based on transformers model")
56
 
57
+ #embeddings_out = embedding_model.encode(sentences=docs, max_length=1024, show_progress_bar = True, batch_size = 32) # For Jina # #
58
+ embeddings_out = embedding_model.encode(sentences=docs, show_progress_bar = True, batch_size = 32) # For BGE
59
 
60
  toc = time.perf_counter()
61
  time_out = f"The embedding took {toc - tic:0.1f} seconds"