Sonnyjim committed
Commit 9eeba1e
Parent: 32cf9fb

App now checks whether embeddings are already loaded before topic modelling, and saves the embeddings only once.

Files changed (3):
  1. app.py +13 -10
  2. funcs/embeddings.py +41 -41
  3. funcs/helper_functions.py +4 -2
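
In outline, the commit gates the expensive embedding step on whether the session already holds embeddings. A minimal sketch of the pattern (names mirror the diffs below; compute_embeddings is a hypothetical stand-in for the real embedding step in funcs/embeddings.py):

import numpy as np

def compute_embeddings(docs):
    # Hypothetical placeholder for the real embedding step.
    return np.random.rand(len(docs), 5)

def make_or_load_embeddings_sketch(docs, embeddings_out):
    # Core of the commit: only embed when the cached state is empty.
    if embeddings_out.size == 0:
        embeddings_out = compute_embeddings(docs)
    else:
        print("Found pre-loaded embeddings.")
    return embeddings_out

docs = ["first document", "second document"]
state = np.array([])                                  # empty state on first run
state = make_or_load_embeddings_sketch(docs, state)   # computes once
state = make_or_load_embeddings_sketch(docs, state)   # reuses the cache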
app.py CHANGED
@@ -80,8 +80,8 @@ hf_model_name = 'TheBloke/phi-2-orange-GGUF' #'NousResearch/Nous-Capybara-7B-V1
 hf_model_file = 'phi-2-orange.Q5_K_M.gguf' #'Capybara-7B-V1.9-Q5_K_M.gguf' # 'stablelm-2-zephyr-1_6b-Q5_K_M.gguf'
 
 
-def extract_topics(in_files, in_file, min_docs_slider, in_colnames, max_topics_slider, candidate_topics, in_label, anonymise_drop, return_intermediate_files, embeddings_super_compress, low_resource_mode, create_llm_topic_labels, save_topic_model, visualise_topics, reduce_outliers):
+def extract_topics(in_files, in_file, min_docs_slider, in_colnames, max_topics_slider, candidate_topics, in_label, anonymise_drop, return_intermediate_files, embeddings_super_compress, low_resource_mode, create_llm_topic_labels, save_topic_model, visualise_topics, reduce_outliers, embeddings_out):
 
     all_tic = time.perf_counter()
 
     output_list = []
@@ -144,7 +144,7 @@ def extract_topics(in_files, in_file, min_docs_slider, in_colnames, max_topics_s
 
     umap_model = TruncatedSVD(n_components=5, random_state=random_seed)
 
-    embeddings_out, reduced_embeddings = make_or_load_embeddings(docs, file_list, data_file_name_no_ext, embedding_model, return_intermediate_files, embeddings_super_compress, low_resource_mode, create_llm_topic_labels)
+    embeddings_out, reduced_embeddings = make_or_load_embeddings(docs, file_list, data_file_name_no_ext, embeddings_out, embedding_model, return_intermediate_files, embeddings_super_compress, low_resource_mode, create_llm_topic_labels)
 
     vectoriser_model = CountVectorizer(stop_words="english", ngram_range=(1, 2), min_df=0.1)
 
@@ -272,12 +272,16 @@ def extract_topics(in_files, in_file, min_docs_slider, in_colnames, max_topics_s
     zip_folder(topic_model_save_name_folder, topic_model_save_name_zip)
     output_list.append(topic_model_save_name_zip)
 
+    # If you want to save your embedding files
     if return_intermediate_files == "Yes":
         print("Saving embeddings to file")
         if low_resource_mode == "Yes":
             embeddings_file_name = data_file_name_no_ext + '_' + 'tfidf_embeddings.npz'
         else:
-            embeddings_file_name = data_file_name_no_ext + '_' + 'ai_embeddings.npz'
+            if embeddings_super_compress == "No":
+                embeddings_file_name = data_file_name_no_ext + '_' + 'ai_embeddings.npz'
+            else:
+                embeddings_file_name = data_file_name_no_ext + '_' + 'ai_embedding_compress.npz'
 
         np.savez_compressed(embeddings_file_name, embeddings_out)
 
@@ -297,15 +301,13 @@ def extract_topics(in_files, in_file, min_docs_slider, in_colnames, max_topics_s
     time_out = f"All processes took {all_toc - all_tic:0.1f} seconds"
     print(time_out)
 
-    return output_text, output_list, topics_vis
+    return output_text, output_list, topics_vis, embeddings_out
 
     all_toc = time.perf_counter()
     time_out = f"All processes took {all_toc - all_tic:0.1f} seconds"
    print(time_out)
 
-    return output_text, output_list, None
-
-    # , topic_model_save_name
+    return output_text, output_list, None, embeddings_out
 
 # ## Gradio app - extract topics
 
@@ -314,6 +316,7 @@ block = gr.Blocks(theme = gr.themes.Base())
 with block:
 
     data_state = gr.State(pd.DataFrame())
+    embeddings_state = gr.State(np.array([]))
 
     gr.Markdown(
     """
@@ -359,10 +362,10 @@ with block:
     visualise_topics = gr.Dropdown(label = "Create a visualisation to map topics.", value="No", choices=["Yes", "No"])
 
     # Update column names dropdown when file uploaded
-    in_files.upload(fn=put_columns_in_df, inputs=[in_files], outputs=[in_colnames, in_label, data_state])
+    in_files.upload(fn=put_columns_in_df, inputs=[in_files], outputs=[in_colnames, in_label, data_state, embeddings_state])
     in_colnames.change(dummy_function, in_colnames, None)
 
-    topics_btn.click(fn=extract_topics, inputs=[data_state, in_files, min_docs_slider, in_colnames, max_topics_slider, candidate_topics, in_label, anonymise_drop, return_intermediate_files, embedding_super_compress, low_resource_mode_opt, create_llm_topic_labels, save_topic_model, visualise_topics, reduce_outliers], outputs=[output_single_text, output_file, plot], api_name="topics")
+    topics_btn.click(fn=extract_topics, inputs=[data_state, in_files, min_docs_slider, in_colnames, max_topics_slider, candidate_topics, in_label, anonymise_drop, return_intermediate_files, embedding_super_compress, low_resource_mode_opt, create_llm_topic_labels, save_topic_model, visualise_topics, reduce_outliers, embeddings_state], outputs=[output_single_text, output_file, plot, embeddings_state], api_name="topics")
 
 block.queue().launch(debug=True)#, server_name="0.0.0.0", ssl_verify=False, server_port=7860)
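
The wiring above uses Gradio's state round trip: embeddings_state appears in both the inputs and outputs of topics_btn.click, so whatever extract_topics returns in that slot is cached and fed back in on the next click. A self-contained sketch of the same pattern, with placeholder components rather than the app's own:

import gradio as gr
import numpy as np

def process(text, embeddings):
    # Stand-in for extract_topics: compute embeddings only when state is empty.
    if embeddings.size == 0:
        embeddings = np.random.rand(3, 5)   # placeholder embedding step
        status = "Computed new embeddings"
    else:
        status = "Reused cached embeddings"
    return status, embeddings               # second value is written back to state

with gr.Blocks() as demo:
    embeddings_state = gr.State(np.array([]))
    text_in = gr.Textbox(label="Input")
    status_out = gr.Textbox(label="Status")
    run_btn = gr.Button("Run")
    run_btn.click(process, inputs=[text_in, embeddings_state],
                  outputs=[status_out, embeddings_state])

# demo.launch()  # uncomment to try it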
funcs/embeddings.py CHANGED
@@ -13,60 +13,60 @@ if cuda.is_available():
 else:
     torch_device = "cpu"
 
-def make_or_load_embeddings(docs, file_list, data_file_name_no_ext, embedding_model, return_intermediate_files, embeddings_super_compress, low_resource_mode_opt, reduce_embeddings="Yes"):
-
-    embeddings_file_names = [string.lower() for string in file_list if "embedding" in string.lower()]
-
-    if embeddings_file_names:
-        print("Loading embeddings from file.")
-        embeddings_out = np.load(embeddings_file_names[0])['arr_0']
-
-        # If embedding files have 'super_compress' in the title, they have been multiplied by 100 before save
-        if "compress" in embeddings_file_names[0]:
-            embeddings_out /= 100
-
-    if not embeddings_file_names:
-        tic = time.perf_counter()
-        print("Starting to embed documents.")
-
-        # Custom model
-        # If on CPU, don't resort to embedding models
-        if low_resource_mode_opt == "Yes":
-            print("Creating simplified 'sparse' embeddings based on TfIDF")
-
-            embedding_model = make_pipeline(
-                TfidfVectorizer(),
-                TruncatedSVD(100, random_state=random_seed)
-            )
-
-            # Fit the pipeline to the text data
-            embedding_model.fit(docs)
-
-            # Transform text data to embeddings
-            embeddings_out = embedding_model.transform(docs)
-
-            #embeddings_out = embedding_model.encode(sentences=docs, show_progress_bar = True, batch_size = 32)
-
-        elif low_resource_mode_opt == "No":
-            print("Creating dense embeddings based on transformers model")
-
-            embeddings_out = embedding_model.encode(sentences=docs, max_length=1024, show_progress_bar = True, batch_size = 32) # For Jina # #
-
-        toc = time.perf_counter()
-        time_out = f"The embedding took {toc - tic:0.1f} seconds"
-        print(time_out)
-
-        # If you want to save your files for next time
-        if return_intermediate_files == "Yes":
-            print("Saving embeddings to file")
-            if embeddings_super_compress == "No":
-                semantic_search_file_name = data_file_name_no_ext + '_' + 'embeddings.npz'
-                np.savez_compressed(semantic_search_file_name, embeddings_out)
-            else:
-                semantic_search_file_name = data_file_name_no_ext + '_' + 'embedding_compress.npz'
-                embeddings_out_round = np.round(embeddings_out, 3)
-                embeddings_out_round *= 100 # Rounding not currently used
-                np.savez_compressed(semantic_search_file_name, embeddings_out_round)
+def make_or_load_embeddings(docs, file_list, data_file_name_no_ext, embeddings_out, embedding_model, return_intermediate_files, embeddings_super_compress, low_resource_mode_opt, reduce_embeddings="Yes"):
+
+    # If no embeddings found, make or load in
+    if embeddings_out.size == 0:
+        print("Embeddings not found. Loading or generating new ones.")
+
+        embeddings_file_names = [string.lower() for string in file_list if "embedding" in string.lower()]
+
+        if embeddings_file_names:
+            print("Loading embeddings from file.")
+            embeddings_out = np.load(embeddings_file_names[0])['arr_0']
+
+            # If embedding files have 'super_compress' in the title, they have been multiplied by 100 before save
+            if "compress" in embeddings_file_names[0]:
+                embeddings_out /= 100
+
+        if not embeddings_file_names:
+            tic = time.perf_counter()
+            print("Starting to embed documents.")
+
+            # Custom model
+            # If on CPU, don't resort to embedding models
+            if low_resource_mode_opt == "Yes":
+                print("Creating simplified 'sparse' embeddings based on TfIDF")
+
+                embedding_model = make_pipeline(
+                    TfidfVectorizer(),
+                    TruncatedSVD(100, random_state=random_seed)
+                )
+
+                # Fit the pipeline to the text data
+                embedding_model.fit(docs)
+
+                # Transform text data to embeddings
+                embeddings_out = embedding_model.transform(docs)
+
+                #embeddings_out = embedding_model.encode(sentences=docs, show_progress_bar = True, batch_size = 32)
+
+            elif low_resource_mode_opt == "No":
+                print("Creating dense embeddings based on transformers model")
+
+                embeddings_out = embedding_model.encode(sentences=docs, max_length=1024, show_progress_bar = True, batch_size = 32) # For Jina # #
+
+            toc = time.perf_counter()
+            time_out = f"The embedding took {toc - tic:0.1f} seconds"
+            print(time_out)
+
+            # If the user has chosen to go with super compressed embedding files to save disk space
+            if embeddings_super_compress == "Yes":
+                embeddings_out = np.round(embeddings_out, 3)
+                embeddings_out *= 100
+
+    else:
+        print("Found pre-loaded embeddings.")
 
     # Pre-reduce embeddings for visualisation purposes
     if reduce_embeddings == "Yes":
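
The 'super compress' option trades precision for disk space: embeddings are rounded to three decimal places and multiplied by 100 before saving, and the loader divides by 100 whenever 'compress' appears in the file name. A small sketch of that round trip (the file name here is illustrative):

import numpy as np

embeddings = np.random.rand(4, 8).astype(np.float32)

# Save side: round to 3 d.p. and scale by 100, as in the diff above.
compressed = np.round(embeddings, 3) * 100
np.savez_compressed("example_embedding_compress.npz", compressed)

# Load side: np.savez_compressed stores an unnamed array under 'arr_0';
# dividing by 100 undoes the scaling, leaving the rounded values.
restored = np.load("example_embedding_compress.npz")["arr_0"] / 100
assert np.allclose(restored, np.round(embeddings, 3))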
funcs/helper_functions.py CHANGED
@@ -5,6 +5,7 @@ import pandas as pd
 import gradio as gr
 import gzip
 import pickle
+import numpy as np
 
 
 def detect_file_type(filename):
@@ -62,8 +63,9 @@ def put_columns_in_df(in_file, in_bm25_column):
 
 
     concat_choices.extend(new_choices)
 
-    return gr.Dropdown(choices=concat_choices), gr.Dropdown(choices=concat_choices), df
+    #The np.array([]) at the end is for clearing the embedding state when a new file is loaded
+    return gr.Dropdown(choices=concat_choices), gr.Dropdown(choices=concat_choices), df, np.array([])
 
 def get_file_path_end(file_path):
     # First, get the basename of the file (e.g., "example.txt" from "/path/to/example.txt")
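
Returning the extra np.array([]) works because Gradio assigns a handler's return values to its outputs list positionally, so the empty array lands in embeddings_state and invalidates the cached embeddings whenever a new file is uploaded. A stripped-down sketch with placeholder column names and components:

import gradio as gr
import numpy as np
import pandas as pd

def on_upload(files):
    # Placeholder for put_columns_in_df: the column names are illustrative.
    choices = ["col_a", "col_b"]
    return (gr.Dropdown(choices=choices),
            gr.Dropdown(choices=choices),
            pd.DataFrame(),
            np.array([]))        # clears embeddings_state on each new upload

with gr.Blocks() as demo:
    data_state = gr.State(pd.DataFrame())
    embeddings_state = gr.State(np.array([]))
    in_files = gr.File(file_count="multiple")
    in_colnames = gr.Dropdown(choices=[], label="Columns")
    in_label = gr.Dropdown(choices=[], label="Label column")
    in_files.upload(on_upload, inputs=[in_files],
                    outputs=[in_colnames, in_label, data_state, embeddings_state])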