Sean-Case committed on
Commit 0a543a0 · Parent: 381f959

Should now parse custom regex correctly. Will now wipe previously created embeddings if the 'low resource mode' option is switched.

app.py CHANGED

@@ -18,6 +18,7 @@ with block:
 
     data_state = gr.State(pd.DataFrame())
     embeddings_state = gr.State(np.array([]))
+    embeddings_type_state = gr.State("")
     topic_model_state = gr.State()
     custom_regex_state = gr.State(pd.DataFrame())
     docs_state = gr.State()
@@ -43,12 +44,13 @@ with block:
 
     with gr.Accordion("Clean data", open = False):
         with gr.Row():
-            clean_text = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Clean data - remove html, numbers with > 1 digits, emails, postcodes (UK).")
+            clean_text = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Clean data - remove html, numbers with > 1 digits, emails, postcodes (UK), custom regex.")
             drop_duplicate_text = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Remove duplicate text, drop < 50 char strings. May make old embedding files incompatible due to differing lengths.")
             anonymise_drop = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Anonymise data on file load. Personal details are redacted - not 100% effective. This is slow!")
         with gr.Row():
-            gr.Markdown("""Import custom regex - csv table with one column of raw text regex patterns with header. Example pattern: r'example'""")
-            custom_regex = gr.UploadButton(label="Import custom regex file", file_count="multiple")
+            custom_regex = gr.UploadButton(label="Import custom regex file", file_count="multiple")
+            gr.Markdown("""Import custom regex - csv table with one column of regex patterns with header. Example pattern: (?i)roosevelt for case insensitive removal of this term.""")
+            custom_regex_text = gr.Textbox(label="Custom regex load status")
         clean_btn = gr.Button("Clean data")
 
     with gr.Accordion("I have my own list of topics (zero shot topic modelling).", open = False):
@@ -105,11 +107,11 @@ with block:
     in_colnames.change(dummy_function, in_colnames, None)
 
     # Clean data
-    custom_regex.upload(fn=custom_regex_load, inputs=[custom_regex], outputs=[custom_regex_state])
+    custom_regex.upload(fn=custom_regex_load, inputs=[custom_regex], outputs=[custom_regex_text, custom_regex_state])
     clean_btn.click(fn=pre_clean, inputs=[data_state, in_colnames, data_file_name_no_ext_state, custom_regex_state, clean_text, drop_duplicate_text, anonymise_drop], outputs=[output_single_text, output_file, data_state, data_file_name_no_ext_state], api_name="clean")
 
     # Extract topics
-    topics_btn.click(fn=extract_topics, inputs=[data_state, in_files, min_docs_slider, in_colnames, max_topics_slider, candidate_topics, data_file_name_no_ext_state, label_list_state, return_intermediate_files, embedding_super_compress, low_resource_mode_opt, save_topic_model, embeddings_state, zero_shot_similarity, seed_number, calc_probs, vectoriser_state], outputs=[output_single_text, output_file, embeddings_state, data_file_name_no_ext_state, topic_model_state, docs_state, vectoriser_state], api_name="topics")
+    topics_btn.click(fn=extract_topics, inputs=[data_state, in_files, min_docs_slider, in_colnames, max_topics_slider, candidate_topics, data_file_name_no_ext_state, label_list_state, return_intermediate_files, embedding_super_compress, low_resource_mode_opt, save_topic_model, embeddings_state, embeddings_type_state, zero_shot_similarity, seed_number, calc_probs, vectoriser_state], outputs=[output_single_text, output_file, embeddings_state, embeddings_type_state, data_file_name_no_ext_state, topic_model_state, docs_state, vectoriser_state], api_name="topics")
 
     # Reduce outliers
     reduce_outliers_btn.click(fn=reduce_outliers, inputs=[topic_model_state, docs_state, embeddings_state, data_file_name_no_ext_state, save_topic_model], outputs=[output_single_text, output_file, topic_model_state], api_name="reduce_outliers")
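The embeddings_type_state addition follows the standard Gradio pattern: a gr.State value is only read if it appears in an event handler's inputs and only persists if the handler returns it among the outputs, which is why the diff threads it through both sides of topics_btn.click. A minimal, self-contained sketch of that round trip — component and function names here are illustrative, not taken from this repo:

import gradio as gr

def process(text, mode, cache, cache_type):
    # Wipe the cache if the requested mode differs from what produced it
    if cache_type != mode:
        cache = None
    if cache is None:
        cache = f"embeddings[{mode}]({text})"  # stand-in for real embedding work
    return f"done with {cache}", cache, mode

with gr.Blocks() as demo:
    cache_state = gr.State(None)     # analogous to embeddings_state
    cache_type_state = gr.State("")  # analogous to embeddings_type_state
    text_in = gr.Textbox()
    mode_in = gr.Dropdown(choices=["bge", "tfidf"], value="bge")
    out = gr.Textbox()
    btn = gr.Button("Run")
    # State must appear in both inputs and outputs to be read and then updated
    btn.click(process, inputs=[text_in, mode_in, cache_state, cache_type_state],
              outputs=[out, cache_state, cache_type_state])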
funcs/clean_funcs.py CHANGED

@@ -42,17 +42,10 @@ def initial_clean(texts, custom_regex, progress=gr.Progress()):
     # Allow for custom regex patterns to be removed
     if len(custom_regex) > 0:
         for pattern in custom_regex:
-            text = text.str.replace_all(pattern, '')
-
-        #text = text.str.replace_all(warning_pattern_regex, '') # This one is quite particular to Lambeth emails
-        #text = text.str.replace_all(egress_pattern_regex, '')
-        #text = text.str.replace_all(r'(?i)2nd floor civic centre', '')
-        #text = text.str.replace_all(r'(?i)6 brixton hill', '')
-        #text = text.str.replace_all(r'(?i)\bsocial care\b', '')
-        #text = text.str.replace_all(r'(?i)\basc\b', '')
-        #text = text.str.replace_all(r'(?i)\bcsc\b', '')
-        #text = text.str.replace_all(r'(?i)\blambeth\b', '')
-
+            raw_string_pattern = r'{}'.format(pattern)
+            print("Removing regex pattern: ", raw_string_pattern)
+            text = text.str.replace_all(raw_string_pattern, '')
+
     text = text.to_list()
 
     return text
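One caveat worth noting: a raw-string prefix only changes how the literal itself is parsed, so r'{}' is the same string as '{}'; the net effect of the new lines is to coerce each loaded pattern to a plain string (and log it) before handing it to Polars. A small standalone sketch of the cleaning step, assuming text is a Polars string Series as in the repo (the sample data is invented):

import polars as pl

text = pl.Series(["Contact Roosevelt at ROOSEVELT HQ.", "Nothing to remove here."])
custom_regex = ["(?i)roosevelt"]  # patterns as loaded from the user's CSV

for pattern in custom_regex:
    raw_string_pattern = r'{}'.format(pattern)  # the r-prefix affects only the '{}' literal
    print("Removing regex pattern: ", raw_string_pattern)
    # Series.str.replace_all treats the pattern as a regex unless literal=True
    text = text.str.replace_all(raw_string_pattern, '')

print(text.to_list())  # ['Contact  at  HQ.', 'Nothing to remove here.']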
funcs/helper_functions.py CHANGED

@@ -153,9 +153,9 @@ def custom_regex_load(in_file):
         error = "No regex file provided."
         print(error)
         output_text = error
-        return custom_regex
+        return error, custom_regex
 
-    return custom_regex
+    return output_text, custom_regex
 
 
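This keeps the loader's return shape in sync with the two outputs now wired to it in app.py ([custom_regex_text, custom_regex_state]); every return path has to yield one value per output component or Gradio errors at event time. A hedged sketch of the whole function under that contract — only the (status text, DataFrame) return shape comes from the diff, the CSV-reading body is an assumption:

import pandas as pd

def custom_regex_load(in_file):
    # Hypothetical body: read the first uploaded CSV of regex patterns.
    custom_regex = pd.DataFrame()

    file_names = [f.name for f in in_file] if in_file else []
    csv_files = [name for name in file_names if name.lower().endswith(".csv")]

    if not csv_files:
        error = "No regex file provided."
        print(error)
        return error, custom_regex

    custom_regex = pd.read_csv(csv_files[0])
    output_text = "Loaded " + str(len(custom_regex)) + " custom regex pattern(s)."
    print(output_text)
    return output_text, custom_regex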
 
funcs/topic_core_funcs.py CHANGED

@@ -126,7 +126,7 @@ def pre_clean(data, in_colnames, data_file_name_no_ext, custom_regex, clean_text
 
     return output_text, output_list, data, data_file_name_no_ext
 
-def extract_topics(data, in_files, min_docs_slider, in_colnames, max_topics_slider, candidate_topics, data_file_name_no_ext, custom_labels_df, return_intermediate_files, embeddings_super_compress, low_resource_mode, save_topic_model, embeddings_out, zero_shot_similarity, random_seed, calc_probs, vectoriser_state, progress=gr.Progress(track_tqdm=True)):
+def extract_topics(data, in_files, min_docs_slider, in_colnames, max_topics_slider, candidate_topics, data_file_name_no_ext, custom_labels_df, return_intermediate_files, embeddings_super_compress, low_resource_mode, save_topic_model, embeddings_out, embeddings_type_state, zero_shot_similarity, random_seed, calc_probs, vectoriser_state, progress=gr.Progress(track_tqdm=True)):
 
     all_tic = time.perf_counter()
 
@@ -161,7 +161,13 @@ def extract_topics(data, in_files, min_docs_slider, in_colnames, max_topics_slid
     if low_resource_mode == "No":
         print("Using high resource BGE transformer model")
 
-        embedding_model = SentenceTransformer(embeddings_name)
+        embedding_model = SentenceTransformer(embeddings_name)
+
+        # If tfidf embeddings currently exist, wipe these empty
+        if embeddings_type_state == "tfidf":
+            embeddings_out = np.array([])
+
+        embeddings_type_state = "bge"
 
         # UMAP model uses Bertopic defaults
         umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', low_memory=False, random_state=random_seed)
@@ -169,11 +175,16 @@ def extract_topics(data, in_files, min_docs_slider, in_colnames, max_topics_slid
     else:
         print("Choosing low resource TF-IDF model.")
 
-        embedding_model_pipe = make_pipeline(
+        embedding_model = make_pipeline(
             TfidfVectorizer(),
             TruncatedSVD(100, random_state=random_seed)
         )
-        embedding_model = embedding_model_pipe
+
+        # If bge embeddings currently exist, wipe these empty, then rename embeddings type
+        if embeddings_type_state == "bge":
+            embeddings_out = np.array([])
+
+        embeddings_type_state = "tfidf"
 
         umap_model = TruncatedSVD(n_components=5, random_state=random_seed)
 
@@ -246,7 +257,7 @@ def extract_topics(data, in_files, min_docs_slider, in_colnames, max_topics_slid
     except:
         print(fail_error_message)
 
-        return fail_error_message, output_list, embeddings_out, data_file_name_no_ext, None, docs, vectoriser_model
+        return fail_error_message, output_list, embeddings_out, embeddings_type_state, data_file_name_no_ext, None, docs, vectoriser_model
 
     # For some reason, zero topic modelling exports assigned topics as a np.array instead of a list. Converting it back here.
     if isinstance(assigned_topics, np.ndarray):
@@ -268,7 +279,7 @@ def extract_topics(data, in_files, min_docs_slider, in_colnames, max_topics_slid
 
     if not assigned_topics:
         # Handle the empty array case
-        return "No topics found.", output_list, embeddings_out, data_file_name_no_ext, topic_model, docs
+        return "No topics found.", output_list, embeddings_out, embeddings_type_state, data_file_name_no_ext, topic_model, docs
 
     else:
         print("Topic model created.")
@@ -304,7 +315,7 @@ def extract_topics(data, in_files, min_docs_slider, in_colnames, max_topics_slid
     time_out = f"All processes took {all_toc - all_tic:0.1f} seconds."
     print(time_out)
 
-    return output_text, output_list, embeddings_out, data_file_name_no_ext, topic_model, docs, vectoriser_model
+    return output_text, output_list, embeddings_out, embeddings_type_state, data_file_name_no_ext, topic_model, docs, vectoriser_model
 
 def reduce_outliers(topic_model, docs, embeddings_out, data_file_name_no_ext, save_topic_model, progress=gr.Progress(track_tqdm=True)):
 
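The wipe-on-switch guards exist because the two embedding routes produce arrays of different shapes — the low-resource pipeline reduces documents to TruncatedSVD(100) components, while the BGE sentence transformer emits vectors of its own fixed dimensionality — so a cached array from one mode cannot be fed back to BERTopic in the other. A standalone sketch of that guard, assuming only the "bge"/"tfidf" labels used in the diff (the helper itself is illustrative, not from the repo):

import numpy as np

def reconcile_embeddings(embeddings_out, embeddings_type_state, low_resource_mode):
    # Wipe cached embeddings when the requested mode differs from the one
    # that produced them, then record the new mode. Mirrors the guards
    # added inside extract_topics.
    wanted_type = "tfidf" if low_resource_mode == "Yes" else "bge"
    if embeddings_type_state and embeddings_type_state != wanted_type:
        embeddings_out = np.array([])  # stale cache: wrong dimensionality
    return embeddings_out, wanted_type

# Example: a tfidf cache is discarded when switching back to the BGE model
cached, mode = reconcile_embeddings(np.zeros((10, 100)), "tfidf", "No")
print(cached.size, mode)  # -> 0 bge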