Sean-Case committed on
Commit 0a543a0 · Parent: 381f959

Should now parse custom regex correctly. Will now wipe previously created embeddings if the 'low resource mode' option is switched.

app.py CHANGED

@@ -18,6 +18,7 @@ with block:
 
     data_state = gr.State(pd.DataFrame())
     embeddings_state = gr.State(np.array([]))
+    embeddings_type_state = gr.State("")
     topic_model_state = gr.State()
     custom_regex_state = gr.State(pd.DataFrame())
     docs_state = gr.State()
@@ -43,12 +44,13 @@ with block:
 
     with gr.Accordion("Clean data", open = False):
         with gr.Row():
-            clean_text = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Clean data - remove html, numbers with > 1 digits, emails, postcodes (UK).")
+            clean_text = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Clean data - remove html, numbers with > 1 digits, emails, postcodes (UK), custom regex.")
             drop_duplicate_text = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Remove duplicate text, drop < 50 char strings. May make old embedding files incompatible due to differing lengths.")
             anonymise_drop = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Anonymise data on file load. Personal details are redacted - not 100% effective. This is slow!")
         with gr.Row():
-            gr.Markdown("""Import custom regex - csv table with one column of raw text regex patterns with header. Example pattern: r'example'""")
-            custom_regex = gr.UploadButton(label="Import custom regex file", file_count="multiple")
+            custom_regex = gr.UploadButton(label="Import custom regex file", file_count="multiple")
+            gr.Markdown("""Import custom regex - csv table with one column of regex patterns with header. Example pattern: (?i)roosevelt for case insensitive removal of this term.""")
+            custom_regex_text = gr.Textbox(label="Custom regex load status")
         clean_btn = gr.Button("Clean data")
 
     with gr.Accordion("I have my own list of topics (zero shot topic modelling).", open = False):
@@ -105,11 +107,11 @@ with block:
     in_colnames.change(dummy_function, in_colnames, None)
 
     # Clean data
-    custom_regex.upload(fn=custom_regex_load, inputs=[custom_regex], outputs=[custom_regex_state])
+    custom_regex.upload(fn=custom_regex_load, inputs=[custom_regex], outputs=[custom_regex_text, custom_regex_state])
     clean_btn.click(fn=pre_clean, inputs=[data_state, in_colnames, data_file_name_no_ext_state, custom_regex_state, clean_text, drop_duplicate_text, anonymise_drop], outputs=[output_single_text, output_file, data_state, data_file_name_no_ext_state], api_name="clean")
 
     # Extract topics
-    topics_btn.click(fn=extract_topics, inputs=[data_state, in_files, min_docs_slider, in_colnames, max_topics_slider, candidate_topics, data_file_name_no_ext_state, label_list_state, return_intermediate_files, embedding_super_compress, low_resource_mode_opt, save_topic_model, embeddings_state, zero_shot_similarity, seed_number, calc_probs, vectoriser_state], outputs=[output_single_text, output_file, embeddings_state, data_file_name_no_ext_state, topic_model_state, docs_state, vectoriser_state], api_name="topics")
+    topics_btn.click(fn=extract_topics, inputs=[data_state, in_files, min_docs_slider, in_colnames, max_topics_slider, candidate_topics, data_file_name_no_ext_state, label_list_state, return_intermediate_files, embedding_super_compress, low_resource_mode_opt, save_topic_model, embeddings_state, embeddings_type_state, zero_shot_similarity, seed_number, calc_probs, vectoriser_state], outputs=[output_single_text, output_file, embeddings_state, embeddings_type_state, data_file_name_no_ext_state, topic_model_state, docs_state, vectoriser_state], api_name="topics")
 
     # Reduce outliers
     reduce_outliers_btn.click(fn=reduce_outliers, inputs=[topic_model_state, docs_state, embeddings_state, data_file_name_no_ext_state, save_topic_model], outputs=[output_single_text, output_file, topic_model_state], api_name="reduce_outliers")
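The embeddings_type_state addition follows the standard Gradio pattern: a gr.State value is only read if it appears in an event handler's inputs and only persists if the handler returns it among the outputs, which is why the diff threads it through both sides of topics_btn.click. A minimal, self-contained sketch of that round trip — component and function names here are illustrative, not taken from this repo:

import gradio as gr

def process(text, mode, cache, cache_type):
    # Wipe the cache if the requested mode differs from what produced it
    if cache_type != mode:
        cache = None
    if cache is None:
        cache = f"embeddings[{mode}]({text})"  # stand-in for real embedding work
    return f"done with {cache}", cache, mode

with gr.Blocks() as demo:
    cache_state = gr.State(None)     # analogous to embeddings_state
    cache_type_state = gr.State("")  # analogous to embeddings_type_state
    text_in = gr.Textbox()
    mode_in = gr.Dropdown(choices=["bge", "tfidf"], value="bge")
    out = gr.Textbox()
    btn = gr.Button("Run")
    # State must appear in both inputs and outputs to be read and then updated
    btn.click(process, inputs=[text_in, mode_in, cache_state, cache_type_state],
              outputs=[out, cache_state, cache_type_state])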
funcs/clean_funcs.py CHANGED

@@ -42,17 +42,10 @@ def initial_clean(texts, custom_regex, progress=gr.Progress()):
     # Allow for custom regex patterns to be removed
     if len(custom_regex) > 0:
         for pattern in custom_regex:
-            text = text.str.replace_all(pattern, '')
-
-        #text = text.str.replace_all(warning_pattern_regex, '') # This one is quite particular to Lambeth emails
-        #text = text.str.replace_all(egress_pattern_regex, '')
-        #text = text.str.replace_all(r'(?i)2nd floor civic centre', '')
-        #text = text.str.replace_all(r'(?i)6 brixton hill', '')
-        #text = text.str.replace_all(r'(?i)\bsocial care\b', '')
-        #text = text.str.replace_all(r'(?i)\basc\b', '')
-        #text = text.str.replace_all(r'(?i)\bcsc\b', '')
-        #text = text.str.replace_all(r'(?i)\blambeth\b', '')
-
+            raw_string_pattern = r'{}'.format(pattern)
+            print("Removing regex pattern: ", raw_string_pattern)
+            text = text.str.replace_all(raw_string_pattern, '')
+
     text = text.to_list()
 
     return text
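One caveat worth noting: a raw-string prefix only changes how the literal itself is parsed, so r'{}' is the same string as '{}'; the net effect of the new lines is to coerce each loaded pattern to a plain string (and log it) before handing it to Polars. A small standalone sketch of the cleaning step, assuming text is a Polars string Series as in the repo (the sample data is invented):

import polars as pl

text = pl.Series(["Contact Roosevelt at ROOSEVELT HQ.", "Nothing to remove here."])
custom_regex = ["(?i)roosevelt"]  # patterns as loaded from the user's CSV

for pattern in custom_regex:
    raw_string_pattern = r'{}'.format(pattern)  # the r-prefix affects only the '{}' literal
    print("Removing regex pattern: ", raw_string_pattern)
    # Series.str.replace_all treats the pattern as a regex unless literal=True
    text = text.str.replace_all(raw_string_pattern, '')

print(text.to_list())  # ['Contact  at  HQ.', 'Nothing to remove here.']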
funcs/helper_functions.py CHANGED

@@ -153,9 +153,9 @@ def custom_regex_load(in_file):
         error = "No regex file provided."
         print(error)
         output_text = error
-        return custom_regex
+        return error, custom_regex
 
-    return custom_regex
+    return output_text, custom_regex
 
 
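This keeps the loader's return shape in sync with the two outputs now wired to it in app.py ([custom_regex_text, custom_regex_state]); every return path has to yield one value per output component or Gradio errors at event time. A hedged sketch of the whole function under that contract — only the (status text, DataFrame) return shape comes from the diff, the CSV-reading body is an assumption:

import pandas as pd

def custom_regex_load(in_file):
    # Hypothetical body: read the first uploaded CSV of regex patterns.
    custom_regex = pd.DataFrame()

    file_names = [f.name for f in in_file] if in_file else []
    csv_files = [name for name in file_names if name.lower().endswith(".csv")]

    if not csv_files:
        error = "No regex file provided."
        print(error)
        return error, custom_regex

    custom_regex = pd.read_csv(csv_files[0])
    output_text = "Loaded " + str(len(custom_regex)) + " custom regex pattern(s)."
    print(output_text)
    return output_text, custom_regex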
 
funcs/topic_core_funcs.py CHANGED

@@ -126,7 +126,7 @@ def pre_clean(data, in_colnames, data_file_name_no_ext, custom_regex, clean_text
 
     return output_text, output_list, data, data_file_name_no_ext
 
-def extract_topics(data, in_files, min_docs_slider, in_colnames, max_topics_slider, candidate_topics, data_file_name_no_ext, custom_labels_df, return_intermediate_files, embeddings_super_compress, low_resource_mode, save_topic_model, embeddings_out, zero_shot_similarity, random_seed, calc_probs, vectoriser_state, progress=gr.Progress(track_tqdm=True)):
+def extract_topics(data, in_files, min_docs_slider, in_colnames, max_topics_slider, candidate_topics, data_file_name_no_ext, custom_labels_df, return_intermediate_files, embeddings_super_compress, low_resource_mode, save_topic_model, embeddings_out, embeddings_type_state, zero_shot_similarity, random_seed, calc_probs, vectoriser_state, progress=gr.Progress(track_tqdm=True)):
 
     all_tic = time.perf_counter()
 
@@ -161,7 +161,13 @@ def extract_topics(data, in_files, min_docs_slider, in_colnames, max_topics_slid
     if low_resource_mode == "No":
         print("Using high resource BGE transformer model")
 
-        embedding_model = SentenceTransformer(embeddings_name)
+        embedding_model = SentenceTransformer(embeddings_name)
+
+        # If tfidf embeddings currently exist, wipe these empty
+        if embeddings_type_state == "tfidf":
+            embeddings_out = np.array([])
+
+        embeddings_type_state = "bge"
 
         # UMAP model uses Bertopic defaults
         umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', low_memory=False, random_state=random_seed)
@@ -169,11 +175,16 @@ def extract_topics(data, in_files, min_docs_slider, in_colnames, max_topics_slid
     else:
         print("Choosing low resource TF-IDF model.")
 
-        embedding_model_pipe = make_pipeline(
+        embedding_model = make_pipeline(
             TfidfVectorizer(),
             TruncatedSVD(100, random_state=random_seed)
         )
-        embedding_model = embedding_model_pipe
+
+        # If bge embeddings currently exist, wipe these empty, then rename embeddings type
+        if embeddings_type_state == "bge":
+            embeddings_out = np.array([])
+
+        embeddings_type_state = "tfidf"
 
         umap_model = TruncatedSVD(n_components=5, random_state=random_seed)
 
@@ -246,7 +257,7 @@ def extract_topics(data, in_files, min_docs_slider, in_colnames, max_topics_slid
     except:
         print(fail_error_message)
 
-        return fail_error_message, output_list, embeddings_out, data_file_name_no_ext, None, docs, vectoriser_model
+        return fail_error_message, output_list, embeddings_out, embeddings_type_state, data_file_name_no_ext, None, docs, vectoriser_model
 
     # For some reason, zero topic modelling exports assigned topics as a np.array instead of a list. Converting it back here.
     if isinstance(assigned_topics, np.ndarray):
@@ -268,7 +279,7 @@ def extract_topics(data, in_files, min_docs_slider, in_colnames, max_topics_slid
 
     if not assigned_topics:
         # Handle the empty array case
-        return "No topics found.", output_list, embeddings_out, data_file_name_no_ext, topic_model, docs
+        return "No topics found.", output_list, embeddings_out, embeddings_type_state, data_file_name_no_ext, topic_model, docs
 
     else:
         print("Topic model created.")
@@ -304,7 +315,7 @@ def extract_topics(data, in_files, min_docs_slider, in_colnames, max_topics_slid
     time_out = f"All processes took {all_toc - all_tic:0.1f} seconds."
     print(time_out)
 
-    return output_text, output_list, embeddings_out, data_file_name_no_ext, topic_model, docs, vectoriser_model
+    return output_text, output_list, embeddings_out, embeddings_type_state, data_file_name_no_ext, topic_model, docs, vectoriser_model
 
 def reduce_outliers(topic_model, docs, embeddings_out, data_file_name_no_ext, save_topic_model, progress=gr.Progress(track_tqdm=True)):
 
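The wipe-on-switch guards exist because the two embedding routes produce arrays of different shapes — the low-resource pipeline reduces documents to TruncatedSVD(100) components, while the BGE sentence transformer emits vectors of its own fixed dimensionality — so a cached array from one mode cannot be fed back to BERTopic in the other. A standalone sketch of that guard, assuming only the "bge"/"tfidf" labels used in the diff (the helper itself is illustrative, not from the repo):

import numpy as np

def reconcile_embeddings(embeddings_out, embeddings_type_state, low_resource_mode):
    # Wipe cached embeddings when the requested mode differs from the one
    # that produced them, then record the new mode. Mirrors the guards
    # added inside extract_topics.
    wanted_type = "tfidf" if low_resource_mode == "Yes" else "bge"
    if embeddings_type_state and embeddings_type_state != wanted_type:
        embeddings_out = np.array([])  # stale cache: wrong dimensionality
    return embeddings_out, wanted_type

# Example: a tfidf cache is discarded when switching back to the BGE model
cached, mode = reconcile_embeddings(np.zeros((10, 100)), "tfidf", "No")
print(cached.size, mode)  # -> 0 bge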