Spaces:
Running
Running
Sean-Case
committed on
Commit
·
0a543a0
1
Parent(s):
381f959
Custom regex patterns should now be parsed correctly. Previously created embeddings are now wiped when the 'low resource mode' option is switched.
Browse files
- app.py +7 -5
- funcs/clean_funcs.py +4 -11
- funcs/helper_functions.py +2 -2
- funcs/topic_core_funcs.py +18 -7
app.py
CHANGED
@@ -18,6 +18,7 @@ with block:
|
|
18 |
|
19 |
data_state = gr.State(pd.DataFrame())
|
20 |
embeddings_state = gr.State(np.array([]))
|
|
|
21 |
topic_model_state = gr.State()
|
22 |
custom_regex_state = gr.State(pd.DataFrame())
|
23 |
docs_state = gr.State()
|
@@ -43,12 +44,13 @@ with block:
|
|
43 |
|
44 |
with gr.Accordion("Clean data", open = False):
|
45 |
with gr.Row():
|
46 |
-
clean_text = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Clean data - remove html, numbers with > 1 digits, emails, postcodes (UK).")
|
47 |
drop_duplicate_text = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Remove duplicate text, drop < 50 char strings. May make old embedding files incompatible due to differing lengths.")
|
48 |
anonymise_drop = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Anonymise data on file load. Personal details are redacted - not 100% effective. This is slow!")
|
49 |
with gr.Row():
|
50 |
-
gr.
|
51 |
-
|
|
|
52 |
clean_btn = gr.Button("Clean data")
|
53 |
|
54 |
with gr.Accordion("I have my own list of topics (zero shot topic modelling).", open = False):
|
@@ -105,11 +107,11 @@ with block:
|
|
105 |
in_colnames.change(dummy_function, in_colnames, None)
|
106 |
|
107 |
# Clean data
|
108 |
-
custom_regex.upload(fn=custom_regex_load, inputs=[custom_regex], outputs=[custom_regex_state])
|
109 |
clean_btn.click(fn=pre_clean, inputs=[data_state, in_colnames, data_file_name_no_ext_state, custom_regex_state, clean_text, drop_duplicate_text, anonymise_drop], outputs=[output_single_text, output_file, data_state, data_file_name_no_ext_state], api_name="clean")
|
110 |
|
111 |
# Extract topics
|
112 |
-
topics_btn.click(fn=extract_topics, inputs=[data_state, in_files, min_docs_slider, in_colnames, max_topics_slider, candidate_topics, data_file_name_no_ext_state, label_list_state, return_intermediate_files, embedding_super_compress, low_resource_mode_opt, save_topic_model, embeddings_state, zero_shot_similarity, seed_number, calc_probs, vectoriser_state], outputs=[output_single_text, output_file, embeddings_state, data_file_name_no_ext_state, topic_model_state, docs_state, vectoriser_state], api_name="topics")
|
113 |
|
114 |
# Reduce outliers
|
115 |
reduce_outliers_btn.click(fn=reduce_outliers, inputs=[topic_model_state, docs_state, embeddings_state, data_file_name_no_ext_state, save_topic_model], outputs=[output_single_text, output_file, topic_model_state], api_name="reduce_outliers")
|
|
|
18 |
|
19 |
data_state = gr.State(pd.DataFrame())
|
20 |
embeddings_state = gr.State(np.array([]))
|
21 |
+
embeddings_type_state = gr.State("")
|
22 |
topic_model_state = gr.State()
|
23 |
custom_regex_state = gr.State(pd.DataFrame())
|
24 |
docs_state = gr.State()
|
|
|
44 |
|
45 |
with gr.Accordion("Clean data", open = False):
|
46 |
with gr.Row():
|
47 |
+
clean_text = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Clean data - remove html, numbers with > 1 digits, emails, postcodes (UK), custom regex.")
|
48 |
drop_duplicate_text = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Remove duplicate text, drop < 50 char strings. May make old embedding files incompatible due to differing lengths.")
|
49 |
anonymise_drop = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Anonymise data on file load. Personal details are redacted - not 100% effective. This is slow!")
|
50 |
with gr.Row():
|
51 |
+
custom_regex = gr.UploadButton(label="Import custom regex file", file_count="multiple")
|
52 |
+
gr.Markdown("""Import custom regex - csv table with one column of regex patterns with header. Example pattern: (?i)roosevelt for case insensitive removal of this term.""")
|
53 |
+
custom_regex_text = gr.Textbox(label="Custom regex load status")
|
54 |
clean_btn = gr.Button("Clean data")
|
55 |
|
56 |
with gr.Accordion("I have my own list of topics (zero shot topic modelling).", open = False):
|
|
|
107 |
in_colnames.change(dummy_function, in_colnames, None)
|
108 |
|
109 |
# Clean data
|
110 |
+
custom_regex.upload(fn=custom_regex_load, inputs=[custom_regex], outputs=[custom_regex_text, custom_regex_state])
|
111 |
clean_btn.click(fn=pre_clean, inputs=[data_state, in_colnames, data_file_name_no_ext_state, custom_regex_state, clean_text, drop_duplicate_text, anonymise_drop], outputs=[output_single_text, output_file, data_state, data_file_name_no_ext_state], api_name="clean")
|
112 |
|
113 |
# Extract topics
|
114 |
+
topics_btn.click(fn=extract_topics, inputs=[data_state, in_files, min_docs_slider, in_colnames, max_topics_slider, candidate_topics, data_file_name_no_ext_state, label_list_state, return_intermediate_files, embedding_super_compress, low_resource_mode_opt, save_topic_model, embeddings_state, embeddings_type_state, zero_shot_similarity, seed_number, calc_probs, vectoriser_state], outputs=[output_single_text, output_file, embeddings_state, embeddings_type_state, data_file_name_no_ext_state, topic_model_state, docs_state, vectoriser_state], api_name="topics")
|
115 |
|
116 |
# Reduce outliers
|
117 |
reduce_outliers_btn.click(fn=reduce_outliers, inputs=[topic_model_state, docs_state, embeddings_state, data_file_name_no_ext_state, save_topic_model], outputs=[output_single_text, output_file, topic_model_state], api_name="reduce_outliers")
|
funcs/clean_funcs.py
CHANGED
@@ -42,17 +42,10 @@ def initial_clean(texts, custom_regex, progress=gr.Progress()):
|
|
42 |
# Allow for custom regex patterns to be removed
|
43 |
if len(custom_regex) > 0:
|
44 |
for pattern in custom_regex:
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
#text = text.str.replace_all(r'(?i)2nd floor civic centre', '')
|
50 |
-
#text = text.str.replace_all(r'(?i)6 brixton hill', '')
|
51 |
-
#text = text.str.replace_all(r'(?i)\bsocial care\b', '')
|
52 |
-
#text = text.str.replace_all(r'(?i)\basc\b', '')
|
53 |
-
#text = text.str.replace_all(r'(?i)\bcsc\b', '')
|
54 |
-
#text = text.str.replace_all(r'(?i)\blambeth\b', '')
|
55 |
-
|
56 |
text = text.to_list()
|
57 |
|
58 |
return text
|
|
|
42 |
# Allow for custom regex patterns to be removed
|
43 |
if len(custom_regex) > 0:
|
44 |
for pattern in custom_regex:
|
45 |
+
raw_string_pattern = r'{}'.format(pattern)
|
46 |
+
print("Removing regex pattern: ", raw_string_pattern)
|
47 |
+
text = text.str.replace_all(raw_string_pattern, '')
|
48 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
49 |
text = text.to_list()
|
50 |
|
51 |
return text
|
funcs/helper_functions.py
CHANGED
@@ -153,9 +153,9 @@ def custom_regex_load(in_file):
|
|
153 |
error = "No regex file provided."
|
154 |
print(error)
|
155 |
output_text = error
|
156 |
-
return custom_regex
|
157 |
|
158 |
-
return custom_regex
|
159 |
|
160 |
|
161 |
|
|
|
153 |
error = "No regex file provided."
|
154 |
print(error)
|
155 |
output_text = error
|
156 |
+
return error, custom_regex
|
157 |
|
158 |
+
return output_text, custom_regex
|
159 |
|
160 |
|
161 |
|
funcs/topic_core_funcs.py
CHANGED
@@ -126,7 +126,7 @@ def pre_clean(data, in_colnames, data_file_name_no_ext, custom_regex, clean_text
|
|
126 |
|
127 |
return output_text, output_list, data, data_file_name_no_ext
|
128 |
|
129 |
-
def extract_topics(data, in_files, min_docs_slider, in_colnames, max_topics_slider, candidate_topics, data_file_name_no_ext, custom_labels_df, return_intermediate_files, embeddings_super_compress, low_resource_mode, save_topic_model, embeddings_out, zero_shot_similarity, random_seed, calc_probs, vectoriser_state, progress=gr.Progress(track_tqdm=True)):
|
130 |
|
131 |
all_tic = time.perf_counter()
|
132 |
|
@@ -161,7 +161,13 @@ def extract_topics(data, in_files, min_docs_slider, in_colnames, max_topics_slid
|
|
161 |
if low_resource_mode == "No":
|
162 |
print("Using high resource BGE transformer model")
|
163 |
|
164 |
-
embedding_model = SentenceTransformer(embeddings_name)
|
|
|
|
|
|
|
|
|
|
|
|
|
165 |
|
166 |
# UMAP model uses Bertopic defaults
|
167 |
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', low_memory=False, random_state=random_seed)
|
@@ -169,11 +175,16 @@ def extract_topics(data, in_files, min_docs_slider, in_colnames, max_topics_slid
|
|
169 |
else:
|
170 |
print("Choosing low resource TF-IDF model.")
|
171 |
|
172 |
-
|
173 |
TfidfVectorizer(),
|
174 |
TruncatedSVD(100, random_state=random_seed)
|
175 |
)
|
176 |
-
|
|
|
|
|
|
|
|
|
|
|
177 |
|
178 |
umap_model = TruncatedSVD(n_components=5, random_state=random_seed)
|
179 |
|
@@ -246,7 +257,7 @@ def extract_topics(data, in_files, min_docs_slider, in_colnames, max_topics_slid
|
|
246 |
except:
|
247 |
print(fail_error_message)
|
248 |
|
249 |
-
return fail_error_message, output_list, embeddings_out, data_file_name_no_ext, None, docs, vectoriser_model
|
250 |
|
251 |
# For some reason, zero topic modelling exports assigned topics as a np.array instead of a list. Converting it back here.
|
252 |
if isinstance(assigned_topics, np.ndarray):
|
@@ -268,7 +279,7 @@ def extract_topics(data, in_files, min_docs_slider, in_colnames, max_topics_slid
|
|
268 |
|
269 |
if not assigned_topics:
|
270 |
# Handle the empty array case
|
271 |
-
return "No topics found.", output_list, embeddings_out, data_file_name_no_ext, topic_model, docs
|
272 |
|
273 |
else:
|
274 |
print("Topic model created.")
|
@@ -304,7 +315,7 @@ def extract_topics(data, in_files, min_docs_slider, in_colnames, max_topics_slid
|
|
304 |
time_out = f"All processes took {all_toc - all_tic:0.1f} seconds."
|
305 |
print(time_out)
|
306 |
|
307 |
-
return output_text, output_list, embeddings_out, data_file_name_no_ext, topic_model, docs, vectoriser_model
|
308 |
|
309 |
def reduce_outliers(topic_model, docs, embeddings_out, data_file_name_no_ext, save_topic_model, progress=gr.Progress(track_tqdm=True)):
|
310 |
|
|
|
126 |
|
127 |
return output_text, output_list, data, data_file_name_no_ext
|
128 |
|
129 |
+
def extract_topics(data, in_files, min_docs_slider, in_colnames, max_topics_slider, candidate_topics, data_file_name_no_ext, custom_labels_df, return_intermediate_files, embeddings_super_compress, low_resource_mode, save_topic_model, embeddings_out, embeddings_type_state, zero_shot_similarity, random_seed, calc_probs, vectoriser_state, progress=gr.Progress(track_tqdm=True)):
|
130 |
|
131 |
all_tic = time.perf_counter()
|
132 |
|
|
|
161 |
if low_resource_mode == "No":
|
162 |
print("Using high resource BGE transformer model")
|
163 |
|
164 |
+
embedding_model = SentenceTransformer(embeddings_name)
|
165 |
+
|
166 |
+
# If TF-IDF embeddings currently exist, reset them to an empty array
|
167 |
+
if embeddings_type_state == "tfidf":
|
168 |
+
embeddings_out = np.array([])
|
169 |
+
|
170 |
+
embeddings_type_state = "bge"
|
171 |
|
172 |
# UMAP model uses Bertopic defaults
|
173 |
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', low_memory=False, random_state=random_seed)
|
|
|
175 |
else:
|
176 |
print("Choosing low resource TF-IDF model.")
|
177 |
|
178 |
+
embedding_model = make_pipeline(
|
179 |
TfidfVectorizer(),
|
180 |
TruncatedSVD(100, random_state=random_seed)
|
181 |
)
|
182 |
+
|
183 |
+
# If BGE embeddings currently exist, reset them to an empty array, then update the embeddings type
|
184 |
+
if embeddings_type_state == "bge":
|
185 |
+
embeddings_out = np.array([])
|
186 |
+
|
187 |
+
embeddings_type_state = "tfidf"
|
188 |
|
189 |
umap_model = TruncatedSVD(n_components=5, random_state=random_seed)
|
190 |
|
|
|
257 |
except:
|
258 |
print(fail_error_message)
|
259 |
|
260 |
+
return fail_error_message, output_list, embeddings_out, embeddings_type_state, data_file_name_no_ext, None, docs, vectoriser_model
|
261 |
|
262 |
# For some reason, zero topic modelling exports assigned topics as a np.array instead of a list. Converting it back here.
|
263 |
if isinstance(assigned_topics, np.ndarray):
|
|
|
279 |
|
280 |
if not assigned_topics:
|
281 |
# Handle the empty array case
|
282 |
+
return "No topics found.", output_list, embeddings_out, embeddings_type_state, data_file_name_no_ext, topic_model, docs
|
283 |
|
284 |
else:
|
285 |
print("Topic model created.")
|
|
|
315 |
time_out = f"All processes took {all_toc - all_tic:0.1f} seconds."
|
316 |
print(time_out)
|
317 |
|
318 |
+
return output_text, output_list, embeddings_out, embeddings_type_state, data_file_name_no_ext, topic_model, docs, vectoriser_model
|
319 |
|
320 |
def reduce_outliers(topic_model, docs, embeddings_out, data_file_name_no_ext, save_topic_model, progress=gr.Progress(track_tqdm=True)):
|
321 |
|