Sean-Case committed
Commit: 87306c7
Parent(s): a7fdf3b

Some text changes. Fixed a couple of TF-IDF embeddings issues

Files changed:
- app.py (+5 -5)
- funcs/embeddings.py (+0 -5)
- funcs/topic_core_funcs.py (+3 -3)
app.py
CHANGED
@@ -26,9 +26,9 @@ with block:
     gr.Markdown(
     """
     # Topic modeller
-    Generate topics from open text in tabular data, based on [BERTopic](https://maartengr.github.io/BERTopic/). Upload a data file (csv, xlsx, or parquet), then specify the open text column that you want to use to generate topics. Click 'Extract topics' after you have selected the minimum similar documents per topic and maximum total topics.
+    Generate topics from open text in tabular data, based on [BERTopic](https://maartengr.github.io/BERTopic/). Upload a data file (csv, xlsx, or parquet), then specify the open text column that you want to use to generate topics. Click 'Extract topics' after you have selected the minimum similar documents per topic and maximum total topics. Duplicate this space, or clone to your computer to avoid queues here!

-    Uses fast TF-IDF-based embeddings by default,
+    Uses fast TF-IDF-based embeddings by default, which are fast but not very performant in terms of cluster. Change to [BAAI/bge-small-en-v1.5](https://huggingface.co/BAAI/bge-small-en-v1.5) model embeddings on the options page for topics of much higher quality, but slower processing time. If you have an embeddings .npz file previously made using this model, you can load this in at the same time to skip the first modelling step. If you have a pre-defined list of topics for zero-shot modelling, you can upload this as a csv file under 'I have my own list of topics...'. Further configuration options are available under the 'Options' tab. Topic representation with LLMs currently based on [StableLM-2-Zephyr-1.6B-GGUF](https://huggingface.co/second-state/stablelm-2-zephyr-1.6b-GGUF).

     I suggest [Wikipedia mini dataset](https://huggingface.co/datasets/rag-datasets/mini_wikipedia/tree/main/data) for testing the tool here, choose passages.parquet.
     """)
@@ -51,8 +51,8 @@ with block:
         zero_shot_similarity = gr.Slider(minimum = 0.5, maximum = 1, value = 0.65, step = 0.001, label = "Minimum similarity value for document to be assigned to zero-shot topic.")

     with gr.Row():
-        min_docs_slider = gr.Slider(minimum = 2, maximum = 1000, value =
-        max_topics_slider = gr.Slider(minimum = 2, maximum = 500, value =
+        min_docs_slider = gr.Slider(minimum = 2, maximum = 1000, value = 5, step = 1, label = "Minimum number of similar documents needed to make a topic.")
+        max_topics_slider = gr.Slider(minimum = 2, maximum = 500, value = 50, step = 1, label = "Maximum number of topics")

     with gr.Row():
         topics_btn = gr.Button("Extract topics", variant="primary")
@@ -89,7 +89,7 @@ with block:
         seed_number = gr.Number(label="Random seed to use for dimensionality reduction.", minimum=0, step=1, value=42, precision=0)
         calc_probs = gr.Dropdown(label="Calculate all topic probabilities", value="No", choices=["Yes", "No"])
     with gr.Row():
-        low_resource_mode_opt = gr.Dropdown(label = "Use low resource embeddings and processing.", value="Yes", choices=["Yes", "No"])
+        low_resource_mode_opt = gr.Dropdown(label = "Use low resource (TF-IDF) embeddings and processing.", value="Yes", choices=["Yes", "No"])
         embedding_super_compress = gr.Dropdown(label = "Round embeddings to three dp for smaller files with less accuracy.", value="No", choices=["Yes", "No"])
     with gr.Row():
         return_intermediate_files = gr.Dropdown(label = "Return intermediate processing files from file preparation.", value="Yes", choices=["Yes", "No"])
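For readers unfamiliar with how the description in the Markdown above maps onto code, here is a minimal, hypothetical sketch (not taken from app.py) of the two embedding routes it mentions: the default low-resource TF-IDF pipeline and the BAAI/bge-small-en-v1.5 model, either of which BERTopic accepts as an embedding_model. The parquet file name, the column name, and the mapping of the two sliders onto min_topic_size and nr_topics are assumptions made for illustration.

```python
# Hypothetical sketch of the two embedding routes described above; not the app's code.
import pandas as pd
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic

# Assumed file and column names for the suggested Wikipedia mini dataset.
docs = pd.read_parquet("passages.parquet")["passage"].astype(str).tolist()

# Route 1 (default): fast, low-resource "sparse" embeddings (TF-IDF reduced to 100 dimensions).
low_resource_embedder = make_pipeline(TfidfVectorizer(), TruncatedSVD(100, random_state=42))

# Route 2: slower but higher-quality transformer embeddings.
bge_embedder = SentenceTransformer("BAAI/bge-small-en-v1.5")

topic_model = BERTopic(
    embedding_model=low_resource_embedder,  # or bge_embedder
    min_topic_size=5,                       # assumed analogue of the minimum-documents slider
    nr_topics=50,                           # assumed analogue of the maximum-topics slider
)
topics, probs = topic_model.fit_transform(docs)
```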
funcs/embeddings.py
CHANGED
@@ -38,11 +38,6 @@ def make_or_load_embeddings(docs, file_list, embeddings_out, embedding_model, em
     if low_resource_mode_opt == "Yes":
         print("Creating simplified 'sparse' embeddings based on TfIDF")

-        embedding_model = make_pipeline(
-            TfidfVectorizer(),
-            TruncatedSVD(100, random_state=random_seed)
-        )
-
         # Fit the pipeline to the text data
         embedding_model.fit(docs)

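The removed block suggests that the TF-IDF pipeline is no longer built inside make_or_load_embeddings but is now supplied by the caller and only fitted here. The following is a hedged sketch of that low-resource path, including the optional rounding and .npz saving that app.py describes; the function name, its arguments, and the output file name are assumptions, not the repository's actual code.

```python
# Hedged sketch of the low-resource embedding path after this change; names beyond
# those visible in the diff (make_sparse_embeddings, super_compress, the output path)
# are assumptions.
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

def make_sparse_embeddings(docs, embedding_model, super_compress=False):
    # Fit the caller-supplied pipeline to the text data, then transform to dense vectors.
    embedding_model.fit(docs)
    embeddings_out = embedding_model.transform(docs)
    if super_compress:
        # Mirrors the "round embeddings to three dp" option exposed in app.py.
        embeddings_out = np.round(embeddings_out, 3)
    # Save for reuse, matching the .npz reload option described in app.py.
    np.savez_compressed("embeddings.npz", embeddings=embeddings_out)
    return embeddings_out

docs = ["some open text", "more open text", "yet another response", "a final one"]
pipeline = make_pipeline(TfidfVectorizer(), TruncatedSVD(2, random_state=42))
embeddings = make_sparse_embeddings(docs, pipeline)
```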
funcs/topic_core_funcs.py
CHANGED
@@ -163,12 +163,12 @@ def extract_topics(data, in_files, min_docs_slider, in_colnames, max_topics_slid
         # UMAP model uses Bertopic defaults
         umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', low_memory=False, random_state=random_seed)

-
+    else:
         print("Choosing low resource TF-IDF model.")

         embedding_model_pipe = make_pipeline(
             TfidfVectorizer(),
-            TruncatedSVD(100)
+            TruncatedSVD(100, random_state=random_seed)
         )
         embedding_model = embedding_model_pipe

@@ -209,7 +209,7 @@ def extract_topics(data, in_files, min_docs_slider, in_colnames, max_topics_slid
             error_message = "Zero shot topic modelling currently not compatible with low-resource embeddings. Please change this option to 'No' on the options tab and retry."
             print(error_message)

-            return error_message, output_list, embeddings_out, data_file_name_no_ext, None, docs
+            return error_message, output_list, embeddings_out, data_file_name_no_ext, None, docs, vectoriser_model

         zero_shot_topics = read_file(candidate_topics.name)
         zero_shot_topics_lower = list(zero_shot_topics.iloc[:, 0].str.lower())
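Two things change here: the low-resource branch now seeds TruncatedSVD with the same random_seed already used for UMAP, and the early-return path gains a vectoriser_model value, presumably to match the number of outputs returned elsewhere in extract_topics. The toy example below (not the app's code) illustrates why seeding TruncatedSVD matters: without a fixed random_state, the randomized SVD solver can produce slightly different embeddings from run to run, while a fixed seed makes the reduced embeddings, and hence the topics built on them, reproducible.

```python
# Toy illustration: with a fixed random_state, the TF-IDF + TruncatedSVD embeddings
# are identical across runs, so downstream topics are reproducible.
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

docs = ["apples and oranges", "oranges and bananas", "cars and trains", "trains and planes"]
random_seed = 42

def embed(seed):
    pipe = make_pipeline(TfidfVectorizer(), TruncatedSVD(2, random_state=seed))
    return pipe.fit_transform(docs)

# Two runs with the same seed give identical embeddings.
assert np.allclose(embed(random_seed), embed(random_seed))
```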