Switched embeddings model to BGE Small 1.5, as Jina seemed unable to do zero-shot topic modelling properly
- app.py +25 -17
- funcs/embeddings.py +2 -1
app.py
CHANGED
@@ -4,7 +4,8 @@ from datetime import datetime
 import pandas as pd
 import numpy as np
 import time
 
+from sentence_transformers import SentenceTransformer
 from sklearn.feature_extraction.text import CountVectorizer
 from transformers import AutoModel, AutoTokenizer
 from transformers.pipelines import pipeline
@@ -80,9 +81,10 @@ def read_logs():
 
 # Pinning a Jina revision for security purposes: https://www.baseten.co/blog/pinning-ml-model-revisions-for-compatibility-and-security/
 # Save Jina model locally as described here: https://huggingface.co/jinaai/jina-embeddings-v2-base-en/discussions/29
-embeddings_name = "jinaai/jina-embeddings-v2-base-en"
+embeddings_name = "BAAI/bge-small-en-v1.5" #"jinaai/jina-embeddings-v2-base-en"
 # local_embeddings_location = "model/jina/"
-revision_choice = "b811f03af3d4d7ea72a7c25c802b21fc675a5d99"
+#revision_choice = "b811f03af3d4d7ea72a7c25c802b21fc675a5d99"
+#revision_choice = "69d43700292701b06c24f43b96560566a4e5ad1f"
 
 # Model used for representing topics
 hf_model_name = 'TheBloke/phi-2-orange-GGUF' #'NousResearch/Nous-Capybara-7B-V1.9-GGUF' # 'second-state/stablelm-2-zephyr-1.6b-GGUF'
@@ -182,15 +184,17 @@ def extract_topics(data, in_files, min_docs_slider, in_colnames, max_topics_slid
     print("Low resource mode: ", low_resource_mode)
 
     if low_resource_mode == "No":
-        print("Using high resource Jina transformer model")
-        try:
-            embedding_model = AutoModel.from_pretrained(embeddings_name, revision = revision_choice, trust_remote_code=True, device_map="auto")
-        except:
-            embedding_model = AutoModel.from_pretrained(embeddings_name, revision = revision_choice, trust_remote_code=True, device_map="auto", use_auth_token=os.environ["HF_TOKEN"])
-
-        tokenizer = AutoTokenizer.from_pretrained(embeddings_name)
+        print("Using high resource BGE transformer model")
 
-        embedding_model_pipe = pipeline("feature-extraction", model=embedding_model, tokenizer=tokenizer)
+        embedding_model = SentenceTransformer(embeddings_name)
+        #try:
+        #embedding_model = AutoModel.from_pretrained(embeddings_name, revision = revision_choice, trust_remote_code=True,device_map="auto") # For Jina
+        #except:
+        # embedding_model = AutoModel.from_pretrained(embeddings_name)#, revision = revision_choice, trust_remote_code=True, device_map="auto", use_auth_token=os.environ["HF_TOKEN"])
+        #tokenizer = AutoTokenizer.from_pretrained(embeddings_name)
+        #embedding_model_pipe = pipeline("feature-extraction", model=embedding_model, tokenizer=tokenizer)
 
     # UMAP model uses Bertopic defaults
     umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', low_memory=False, random_state=random_seed)
@@ -216,7 +220,7 @@ def extract_topics(data, in_files, min_docs_slider, in_colnames, max_topics_slid
 
     if not candidate_topics:
 
-        topic_model = BERTopic( embedding_model=embedding_model_pipe,
+        topic_model = BERTopic( embedding_model=embedding_model, #embedding_model_pipe, #for Jina
                                 vectorizer_model=vectoriser_model,
                                 umap_model=umap_model,
                                 min_topic_size = min_docs_slider,
@@ -237,7 +241,7 @@ def extract_topics(data, in_files, min_docs_slider, in_colnames, max_topics_slid
         zero_shot_topics = read_file(candidate_topics.name)
         zero_shot_topics_lower = list(zero_shot_topics.iloc[:, 0].str.lower())
 
-        topic_model = BERTopic( embedding_model=embedding_model_pipe,
+        topic_model = BERTopic( embedding_model=embedding_model, #embedding_model_pipe, # for Jina
                                 vectorizer_model=vectoriser_model,
                                 umap_model=umap_model,
                                 min_topic_size = min_docs_slider,
@@ -248,7 +252,11 @@ def extract_topics(data, in_files, min_docs_slider, in_colnames, max_topics_slid
 
     topics_text, probs = topic_model.fit_transform(docs, embeddings_out)
 
-    if not topics_text:
+    print(topics_text)
+
+    if topics_text.size == 0:
+        # Handle the empty array case
+
         return "No topics found.", data_file_name, None, embeddings_out, data_file_name_no_ext, topic_model, docs, label_list
 
     else:
@@ -264,9 +272,9 @@ def extract_topics(data, in_files, min_docs_slider, in_colnames, max_topics_slid
            embeddings_file_name = data_file_name_no_ext + '_' + 'tfidf_embeddings.npz'
        else:
            if embeddings_super_compress == "No":
-               embeddings_file_name = data_file_name_no_ext + '_' + 'jina_embeddings.npz'
+               embeddings_file_name = data_file_name_no_ext + '_' + 'bge_embeddings.npz'
            else:
-               embeddings_file_name = data_file_name_no_ext + '_' + 'jina_embeddings_compress.npz'
+               embeddings_file_name = data_file_name_no_ext + '_' + 'bge_embeddings_compress.npz'
 
        np.savez_compressed(embeddings_file_name, embeddings_out)
 
@@ -464,7 +472,7 @@ with block:
    gr.Markdown(
    """
    # Topic modeller
-   Generate topics from open text in tabular data. Upload a file (csv, xlsx, or parquet), then specify the open text column that you want to use to generate topics, and another for labels in the visualisation. If you have an embeddings .npz file of the text made using the 'jinaai/jina-embeddings-v2-base-en' model, you can load this in at the same time to skip the first modelling step. If you have a pre-defined list of topics, you can upload this as a csv file under 'I have my own list of topics...'. Further configuration options are available under the 'Options' tab.
+   Generate topics from open text in tabular data. Upload a file (csv, xlsx, or parquet), then specify the open text column that you want to use to generate topics, and another for labels in the visualisation. If you have an embeddings .npz file of the text made using the 'BAAI/bge-small-en-v1.5' model, you can load this in at the same time to skip the first modelling step. If you have a pre-defined list of topics, you can upload this as a csv file under 'I have my own list of topics...'. Further configuration options are available under the 'Options' tab.
 
    Suggested test dataset: https://huggingface.co/datasets/rag-datasets/mini_wikipedia/tree/main/data (passages.parquet)
    """)
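For context, the zero-shot path above hinges on the embedding model producing comparable vectors for both documents and candidate topic labels, which is what the switch to a SentenceTransformer-loaded BGE model enables. A minimal standalone sketch of the new setup, assuming BERTopic >= 0.16 (whose constructor accepts `zeroshot_topic_list` and `zeroshot_min_similarity`); the documents, candidate topics, and threshold here are illustrative and not taken from the app:

```python
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer

# Illustrative corpus and candidate topics (not from the app)
docs = ["The cat sat on the mat", "Stocks fell sharply on Monday",
        "The dog chased the ball", "Markets rallied after the announcement"] * 5
candidate_topics = ["animals", "finance"]

# BGE Small 1.5 loads directly through sentence-transformers, unlike the
# Jina model, which needed trust_remote_code and a pinned revision
embedding_model = SentenceTransformer("BAAI/bge-small-en-v1.5")
embeddings = embedding_model.encode(docs, show_progress_bar=True, batch_size=32)

# Documents whose similarity to a candidate topic exceeds the threshold are
# assigned zero-shot; the rest fall through to normal clustering
topic_model = BERTopic(embedding_model=embedding_model,
                       min_topic_size=2,
                       zeroshot_topic_list=candidate_topics,
                       zeroshot_min_similarity=0.5)
topics, probs = topic_model.fit_transform(docs, embeddings)
print(topic_model.get_topic_info())
```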
funcs/embeddings.py
CHANGED
@@ -54,7 +54,8 @@ def make_or_load_embeddings(docs, file_list, embeddings_out, embedding_model, em
     elif low_resource_mode_opt == "No":
         print("Creating dense embeddings based on transformers model")
 
-        embeddings_out = embedding_model.encode(sentences=docs, max_length=1024, show_progress_bar = True, batch_size = 32) # For Jina # #
+        #embeddings_out = embedding_model.encode(sentences=docs, max_length=1024, show_progress_bar = True, batch_size = 32) # For Jina # #
+        embeddings_out = embedding_model.encode(sentences=docs, show_progress_bar = True, batch_size = 32) # For BGE
 
     toc = time.perf_counter()
     time_out = f"The embedding took {toc - tic:0.1f} seconds"
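Since the app's front-page text invites users to reuse a saved embeddings .npz to skip the encoding step, here is a sketch of that save and reload round trip, consistent with the `np.savez_compressed` call in app.py. The file names and the precision reduction used for the 'super compress' variant are assumptions for illustration, not the app's actual behaviour:

```python
import numpy as np
from sentence_transformers import SentenceTransformer

docs = ["First example document", "Second example document"]  # illustrative

model = SentenceTransformer("BAAI/bge-small-en-v1.5")
embeddings_out = model.encode(sentences=docs, show_progress_bar=True, batch_size=32)

# Save as app.py does; an unnamed array is stored under the key 'arr_0'
np.savez_compressed("example_bge_embeddings.npz", embeddings_out)

# Reload later instead of re-encoding the corpus
embeddings_loaded = np.load("example_bge_embeddings.npz")["arr_0"]
assert embeddings_loaded.shape == embeddings_out.shape

# Hypothetical 'super compress' variant: lower precision before saving
# (an assumption; the app may reduce file size differently)
np.savez_compressed("example_bge_embeddings_compress.npz",
                    embeddings_out.round(3).astype(np.float16))
```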