Spaces:

seanpedrickcase
/

topic_modelling

Running

App Files Files Community

Sonnyjim committed on Jan 29, 2024

Commit

b4510a6

1 Parent(s): 1f1a1c7

Lots of general fixes. New visualisations, fixed hierarchical vis for zero shot. Added calc all probabilities.

Browse files

Files changed (8) hide show

Topic modeller to do.txt +0 -13
app.py +205 -155
funcs/anonymiser.py +0 -1
funcs/bertopic_vis_documents.py +470 -0
funcs/embeddings.py +6 -6
funcs/helper_functions.py +86 -12
funcs/representation_model.py +1 -1
requirements.txt +11 -10

Topic modeller to do.txt DELETED Viewed

@@ -1,13 +0,0 @@
-Need to add option to anonymise - done
-Need to add option to deduplicate
-Need option to sample for X number of rows with specific seed
-Add plotly visualisation - done
-Add zero shot topic list support
-Add topic renaming with LLMs - done
-Option to predict topics on a new dataset - done (kind of - just save model to file)

app.py CHANGED Viewed

@@ -1,4 +1,8 @@
 import os
 import gradio as gr
 from datetime import datetime
 import pandas as pd
@@ -7,8 +11,6 @@ import time
 from sentence_transformers import SentenceTransformer
 from sklearn.feature_extraction.text import CountVectorizer
-from transformers import AutoModel, AutoTokenizer
-from transformers.pipelines import pipeline
 from sklearn.pipeline import make_pipeline
 from sklearn.decomposition import TruncatedSVD
 from sklearn.feature_extraction.text import TfidfVectorizer
@@ -17,9 +19,13 @@ from umap import UMAP
 from torch import cuda, backends, version
 random_seed = 42
 # Check for torch cuda
 print("Is CUDA enabled? ", cuda.is_available())
 print("Is a CUDA device available on this computer?", backends.cudnn.enabled)
 if cuda.is_available():
@@ -33,25 +39,19 @@ else:
 print("Device used is: ", torch_device)
-#os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
-from bertopic import BERTopic
-#from sentence_transformers import SentenceTransformer
-#from bertopic.backend._hftransformers import HFTransformerBackend
-#from cuml.manifold import UMAP
-#umap_model = UMAP(n_components=5, n_neighbors=15, min_dist=0.0)
 today = datetime.now().strftime("%d%m%Y")
 today_rev = datetime.now().strftime("%Y%m%d")
-from funcs.helper_functions import dummy_function, put_columns_in_df, read_file, get_file_path_end, zip_folder, delete_files_in_folder
 #from funcs.representation_model import representation_model
 from funcs.embeddings import make_or_load_embeddings
 # Log terminal output: https://github.com/gradio-app/gradio/issues/2362
 import sys
 class Logger:
@@ -78,89 +78,42 @@ def read_logs():
         return f.read()
 # Load embeddings
 # Pinning a Jina revision for security purposes: https://www.baseten.co/blog/pinning-ml-model-revisions-for-compatibility-and-security/
 # Save Jina model locally as described here: https://huggingface.co/jinaai/jina-embeddings-v2-base-en/discussions/29
-embeddings_name = "BAAI/bge-small-en-v1.5" #"jinaai/jina-embeddings-v2-base-en"
 # local_embeddings_location = "model/jina/"
 #revision_choice = "b811f03af3d4d7ea72a7c25c802b21fc675a5d99"
 #revision_choice = "69d43700292701b06c24f43b96560566a4e5ad1f"
 # Model used for representing topics
-hf_model_name =  'second-state/stablelm-2-zephyr-1.6b-GGUF' #'TheBloke/phi-2-orange-GGUF' #'NousResearch/Nous-Capybara-7B-V1.9-GGUF' # 'second-state/stablelm-2-zephyr-1.6b-GGUF'
-hf_model_file =   'stablelm-2-zephyr-1_6b-Q5_K_M.gguf' # 'phi-2-orange.Q5_K_M.gguf' #'Capybara-7B-V1.9-Q5_K_M.gguf' # 'stablelm-2-zephyr-1_6b-Q5_K_M.gguf'
-def save_topic_outputs(topic_model, data_file_name_no_ext, output_list, docs, save_topic_model, progress=gr.Progress()):
-        topic_dets = topic_model.get_topic_info()
-        if topic_dets.shape[0] == 1:
-            topic_det_output_name = "topic_details_" + data_file_name_no_ext + "_" + today_rev + ".csv"
-            topic_dets.to_csv(topic_det_output_name)
-            output_list.append(topic_det_output_name)
-            return output_list, "No topics found, original file returned"
-        progress(0.8, desc= "Saving output")
-        topic_det_output_name = "topic_details_" + data_file_name_no_ext + "_" + today_rev + ".csv"
-        topic_dets.to_csv(topic_det_output_name)
-        output_list.append(topic_det_output_name)
-        doc_det_output_name = "doc_details_" + data_file_name_no_ext + "_" + today_rev + ".csv"
-        doc_dets = topic_model.get_document_info(docs)[["Document",	"Topic", "Name", "Representative_document"]] # "Probability",
-        doc_dets.to_csv(doc_det_output_name)
-        output_list.append(doc_det_output_name)
-        topics_text_out_str = str(topic_dets["Name"])
-        output_text = "Topics: " + topics_text_out_str
-        # Save topic model to file
-        if save_topic_model == "Yes":
-            topic_model_save_name_pkl = "output_model/" + data_file_name_no_ext + "_topics_" + today_rev + ".pkl"# + ".safetensors"
-            topic_model_save_name_zip = topic_model_save_name_pkl + ".zip"
-            # Clear folder before replacing files
-            #delete_files_in_folder(topic_model_save_name_pkl)
-            topic_model.save(topic_model_save_name_pkl, serialization='pickle', save_embedding_model=False, save_ctfidf=False)
-            # Zip file example
-            #zip_folder(topic_model_save_name_pkl, topic_model_save_name_zip)
-            output_list.append(topic_model_save_name_pkl)
-        return output_list, output_text
-def extract_topics(data, in_files, min_docs_slider, in_colnames, max_topics_slider, candidate_topics, in_label, anonymise_drop, return_intermediate_files, embeddings_super_compress, low_resource_mode, save_topic_model, embeddings_out, zero_shot_similarity, progress=gr.Progress()):
     progress(0, desc= "Loading data")
-    if not in_colnames or not in_label:
-        error_message = "Please enter one column name for the topics and another for the labelling."
         print(error_message)
-        return error_message, None, None, embeddings_out
     all_tic = time.perf_counter()
     output_list = []
     file_list = [string.name for string in in_files]
-    data_file_names = [string.lower() for string in file_list if "tokenised" not in string and "npz" not in string.lower() and "gz" not in string.lower()]
-    data_file_name = data_file_names[0]
-    data_file_name_no_ext = get_file_path_end(data_file_name)
     in_colnames_list_first = in_colnames[0]
-    if in_label:
-        in_label_list_first = in_label[0]
-    else:
-        in_label_list_first = in_colnames_list_first
-    # Make sure format of input series is good
-    data[in_colnames_list_first] = data[in_colnames_list_first].fillna('').astype(str)
-    data[in_label_list_first] = data[in_label_list_first].fillna('').astype(str)
-    label_list = list(data[in_label_list_first])
     if anonymise_drop == "Yes":
         progress(0.1, desc= "Anonymising data")
@@ -172,12 +125,11 @@ def extract_topics(data, in_files, min_docs_slider, in_colnames, max_topics_slid
         data.to_csv(anonymise_data_name)
         output_list.append(anonymise_data_name)
         anon_toc = time.perf_counter()
         time_out = f"Anonymising text took {anon_toc - anon_tic:0.1f} seconds"
-    docs = list(data[in_colnames_list_first].str.lower())
     # Check if embeddings are being loaded in
     progress(0.2, desc= "Loading/creating embeddings")
@@ -185,10 +137,10 @@ def extract_topics(data, in_files, min_docs_slider, in_colnames, max_topics_slid
     if low_resource_mode == "No":
         print("Using high resource BGE transformer model")
         embedding_model = SentenceTransformer(embeddings_name)
         #try:
         #embedding_model = AutoModel.from_pretrained(embeddings_name, revision = revision_choice, trust_remote_code=True,device_map="auto") # For Jina
         #except:
@@ -210,11 +162,15 @@ def extract_topics(data, in_files, min_docs_slider, in_colnames, max_topics_slid
         umap_model = TruncatedSVD(n_components=5, random_state=random_seed)
-    embeddings_out, reduced_embeddings = make_or_load_embeddings(docs, file_list, embeddings_out, embedding_model, embeddings_super_compress, low_resource_mode)
     vectoriser_model = CountVectorizer(stop_words="english", ngram_range=(1, 2), min_df=0.1)
     progress(0.3, desc= "Embeddings loaded. Creating BERTopic model")
@@ -225,17 +181,18 @@ def extract_topics(data, in_files, min_docs_slider, in_colnames, max_topics_slid
                                 umap_model=umap_model,
                                 min_topic_size = min_docs_slider,
                                 nr_topics = max_topics_slider,
                                 verbose = True)
-        topics_text, probs = topic_model.fit_transform(docs, embeddings_out)
-        if not topics_text:
-        # Handle the empty array case
-            return "No topics found.", data_file_name, None, embeddings_out, data_file_name_no_ext, topic_model, docs, label_list
-        else:
-            print("Topic model created.")
     # Do this if you have pre-defined topics
@@ -244,11 +201,13 @@ def extract_topics(data, in_files, min_docs_slider, in_colnames, max_topics_slid
             error_message = "Zero shot topic modelling currently not compatible with low-resource embeddings. Please change this option to 'No' on the options tab and retry."
             print(error_message)
-            return error_message, output_list, None, embeddings_out, data_file_name_no_ext, None, docs, label_list
         zero_shot_topics = read_file(candidate_topics.name)
         zero_shot_topics_lower = list(zero_shot_topics.iloc[:, 0].str.lower())
         topic_model = BERTopic( embedding_model=embedding_model, #embedding_model_pipe, # for Jina
                                 vectorizer_model=vectoriser_model,
                                 umap_model=umap_model,
@@ -256,19 +215,51 @@ def extract_topics(data, in_files, min_docs_slider, in_colnames, max_topics_slid
                                 nr_topics = max_topics_slider,
                                 zeroshot_topic_list = zero_shot_topics_lower,
                                 zeroshot_min_similarity = zero_shot_similarity, # 0.7
                                 verbose = True)
-        topics_text, probs = topic_model.fit_transform(docs, embeddings_out)
-       # print(topics_text)
-        if topics_text.size == 0:
-        # Handle the empty array case
-            return "No topics found.", data_file_name, None, embeddings_out, data_file_name_no_ext, topic_model, docs, label_list
-        else:
-            print("Topic model created.")
     # Outputs
     output_list, output_text = save_topic_outputs(topic_model, data_file_name_no_ext, output_list, docs, save_topic_model)
@@ -292,37 +283,40 @@ def extract_topics(data, in_files, min_docs_slider, in_colnames, max_topics_slid
     time_out = f"All processes took {all_toc - all_tic:0.1f} seconds."
     print(time_out)
-    return output_text, output_list, None, embeddings_out, data_file_name_no_ext, topic_model, docs, label_list
-def reduce_outliers(topic_model, docs, embeddings_out, data_file_name_no_ext, low_resource_mode, create_llm_topic_labels, save_topic_model, progress=gr.Progress()):
-    #from funcs.prompts import capybara_prompt, capybara_start, open_hermes_prompt, open_hermes_start, stablelm_prompt, stablelm_start
-    from funcs.representation_model import create_representation_model, llm_config, chosen_start_tag
     output_list = []
     all_tic = time.perf_counter()
-    vectoriser_model = CountVectorizer(stop_words="english", ngram_range=(1, 2), min_df=0.1)
-    topics_text, probs = topic_model.fit_transform(docs, embeddings_out)
     #progress(0.2, desc= "Loading in representation model")
     #print("Create LLM topic labels:", create_llm_topic_labels)
     #representation_model = create_representation_model(create_llm_topic_labels, llm_config, hf_model_name, hf_model_file, chosen_start_tag, low_resource_mode)
     # Reduce outliers if required, then update representation
     progress(0.2, desc= "Reducing outliers")
     print("Reducing outliers.")
     # With strategy="embeddings", each outlier document is assigned to the topic whose embedding is most similar (by cosine similarity) to the document's embedding.
-    topics_text = topic_model.reduce_outliers(docs, topics_text, strategy="embeddings")
     # Then, update the topics to the ones that considered the new data
     print("Finished reducing outliers.")
-    progress(0.5, desc= "Creating topic representations")
-    print("Create LLM topic labels:", "No")
-    representation_model = create_representation_model("No", llm_config, hf_model_name, hf_model_file, chosen_start_tag, low_resource_mode)
-    topic_model.update_topics(docs, topics=topics_text, vectorizer_model=vectoriser_model, representation_model=representation_model)
     topic_dets = topic_model.get_topic_info()
@@ -334,15 +328,16 @@ def reduce_outliers(topic_model, docs, embeddings_out, data_file_name_no_ext, lo
         topic_model.set_topic_labels(list(topic_dets["Name"]))
     # Outputs
     output_list, output_text = save_topic_outputs(topic_model, data_file_name_no_ext, output_list, docs, save_topic_model)
     all_toc = time.perf_counter()
     time_out = f"All processes took {all_toc - all_tic:0.1f} seconds"
     print(time_out)
-    return output_text, output_list, embeddings_out
-def represent_topics(topic_model, docs, embeddings_out, data_file_name_no_ext, low_resource_mode, save_topic_model, progress=gr.Progress()):
     #from funcs.prompts import capybara_prompt, capybara_start, open_hermes_prompt, open_hermes_start, stablelm_prompt, stablelm_start
     from funcs.representation_model import create_representation_model, llm_config, chosen_start_tag
@@ -352,48 +347,76 @@ def represent_topics(topic_model, docs, embeddings_out, data_file_name_no_ext, l
     vectoriser_model = CountVectorizer(stop_words="english", ngram_range=(1, 2), min_df=0.1)
-    topics_text, probs = topic_model.fit_transform(docs, embeddings_out)
     topic_dets = topic_model.get_topic_info()
-    progress(0.2, desc= "Creating topic representations")
     print("Create LLM topic labels:", "Yes")
     representation_model = create_representation_model("Yes", llm_config, hf_model_name, hf_model_file, chosen_start_tag, low_resource_mode)
-    topic_model.update_topics(docs, topics=topics_text, vectorizer_model=vectoriser_model, representation_model=representation_model)
     # Replace original labels with LLM labels
     if "LLM" in topic_model.get_topic_info().columns:
         llm_labels = [label[0][0].split("\n")[0] for label in topic_model.get_topics(full=True)["LLM"].values()]
         topic_model.set_topic_labels(llm_labels)
-        with open('llm_topic_list.csv', 'w') as file:
-            for item in llm_labels:
-                file.write(f"{item}\n")
-        output_list.append('llm_topic_list.csv')
     else:
         topic_model.set_topic_labels(list(topic_dets["Name"]))
-    # Outputs
     output_list, output_text = save_topic_outputs(topic_model, data_file_name_no_ext, output_list, docs, save_topic_model)
     all_toc = time.perf_counter()
     time_out = f"All processes took {all_toc - all_tic:0.1f} seconds"
     print(time_out)
-    return output_text, output_list, embeddings_out
-def visualise_topics(topic_model, docs, data_file_name_no_ext, low_resource_mode,  embeddings_out, label_list, sample_prop, visualisation_type_radio, progress=gr.Progress()):
     output_list = []
     vis_tic = time.perf_counter()
-    from funcs.bertopic_vis_documents import visualize_documents_custom
     topic_dets = topic_model.get_topic_info()
-    # Replace original labels with LLM labels
     if "LLM" in topic_model.get_topic_info().columns:
         llm_labels = [label[0][0].split("\n")[0] for label in topic_model.get_topics(full=True)["LLM"].values()]
         topic_model.set_topic_labels(llm_labels)
@@ -414,16 +437,37 @@ def visualise_topics(topic_model, docs, data_file_name_no_ext, low_resource_mode
     # "Topic document graph", "Hierarchical view"
     if visualisation_type_radio == "Topic document graph":
-        topics_vis = visualize_documents_custom(topic_model, docs, hover_labels = label_list, reduced_embeddings=reduced_embeddings, hide_annotations=True, hide_document_hover=False, custom_labels=True, sample = sample_prop)
-        topics_vis_name = data_file_name_no_ext + '_' + 'visualisation_' + today_rev + '.html'
         topics_vis.write_html(topics_vis_name)
         output_list.append(topics_vis_name)
     elif visualisation_type_radio == "Hierarchical view":
         hierarchical_topics = topic_model.hierarchical_topics(docs)
-        topics_vis = topic_model.visualize_hierarchical_documents(docs, hierarchical_topics, reduced_embeddings=reduced_embeddings, sample = sample_prop)
-        topics_vis_2 = topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)
         topics_vis_name = data_file_name_no_ext + '_' + 'vis_hierarchy_topic_doc_' + today_rev + '.html'
         topics_vis.write_html(topics_vis_name)
@@ -433,24 +477,22 @@ def visualise_topics(topic_model, docs, data_file_name_no_ext, low_resource_mode
         topics_vis_2.write_html(topics_vis_2_name)
         output_list.append(topics_vis_2_name)
-        # Save new hierarchical topic model to file
-        import pandas as pd
-        hierarchical_topics_name = data_file_name_no_ext + '_' + 'vis_hierarchy_topics' + today_rev + '.csv'
-        hierarchical_topics.to_csv(hierarchical_topics_name)
-        output_list.append(hierarchical_topics_name)
-        #output_list, output_text = save_topic_outputs(topic_model, data_file_name_no_ext, output_list, docs, save_topic_model)
     all_toc = time.perf_counter()
     time_out = f"Creating visualisation took {all_toc - vis_tic:0.1f} seconds"
     print(time_out)
-    return time_out, output_list, topics_vis, embeddings_out
-def save_as_pytorch_model(topic_model, docs, data_file_name_no_ext , progress=gr.Progress()):
     output_list = []
     topic_model_save_name_folder = "output_model/" + data_file_name_no_ext + "_topics_" + today_rev# + ".safetensors"
     topic_model_save_name_zip = topic_model_save_name_folder + ".zip"
@@ -464,6 +506,8 @@ def save_as_pytorch_model(topic_model, docs, data_file_name_no_ext , progress=gr
     zip_folder(topic_model_save_name_folder, topic_model_save_name_zip)
     output_list.append(topic_model_save_name_zip)
 # Gradio app
 block = gr.Blocks(theme = gr.themes.Base())
@@ -475,7 +519,7 @@ with block:
     topic_model_state = gr.State()
     docs_state = gr.State()
     data_file_name_no_ext_state = gr.State()
-    label_list_state = gr.State()
     gr.Markdown(
     """
@@ -489,8 +533,7 @@ with block:
         with gr.Accordion("Load data file", open = True):
             in_files = gr.File(label="Input text from file", file_count="multiple")
             with gr.Row():
-                in_colnames = gr.Dropdown(choices=["Choose a column"], multiselect = True, label="Select column to find topics (first will be chosen if multiple selected).")
-                in_label = gr.Dropdown(choices=["Choose a column"], multiselect = True, label="Select column for labelling documents in the output visualisation.")
         with gr.Accordion("I have my own list of topics (zero shot topic modelling).", open = False):
             candidate_topics = gr.File(label="Input topics from file (csv). File should have at least one column with a header and topic keywords in cells below. Topics will be taken from the first column of the file. Currently not compatible with low-resource embeddings.")
@@ -511,41 +554,48 @@ with block:
             with gr.Row():
                 reduce_outliers_btn = gr.Button("Reduce outliers")
                 represent_llm_btn = gr.Button("Generate topic labels with LLMs")
         #logs = gr.Textbox(label="Processing logs.")
     with gr.Tab("Visualise"):
-        sample_slide = gr.Slider(minimum = 0.01, maximum = 1, value = 0.1, step = 0.01, label = "Proportion of data points to show on output visualisation.")
-        visualisation_type_radio = gr.Radio(choices=["Topic document graph", "Hierarchical view"])
         plot_btn = gr.Button("Visualise topic model")
-        out_plot_file = gr.File(label="Output plots to file", file_count="multiple")
-        plot = gr.Plot(label="Visualise your topics here. Go to the 'Options' tab to enable.")
     with gr.Tab("Options"):
         with gr.Accordion("Data load and processing options", open = True):
             with gr.Row():
                 anonymise_drop = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Anonymise data on file load. Names and other details are replaced with tags e.g. '<person>'.")
                 embedding_super_compress = gr.Dropdown(label = "Round embeddings to three dp for smaller files with less accuracy.", value="No", choices=["Yes", "No"])
-                #create_llm_topic_labels = gr.Dropdown(label = "Create topic labels based on LLMs.", value="No", choices=["Yes", "No"])
             with gr.Row():
                 low_resource_mode_opt = gr.Dropdown(label = "Use low resource embeddings and processing.", value="No", choices=["Yes", "No"])
-                return_intermediate_files = gr.Dropdown(label = "Return intermediate processing files from file preparation. Files can be loaded in to save processing time in future.", value="Yes", choices=["Yes", "No"])
                 save_topic_model = gr.Dropdown(label = "Save topic model to file.", value="Yes", choices=["Yes", "No"])
     # Update column names dropdown when file uploaded
-    in_files.upload(fn=put_columns_in_df, inputs=[in_files], outputs=[in_colnames, in_label, data_state, embeddings_state, output_single_text, topic_model_state])
     in_colnames.change(dummy_function, in_colnames, None)
-    topics_btn.click(fn=extract_topics, inputs=[data_state, in_files, min_docs_slider, in_colnames, max_topics_slider, candidate_topics, in_label, anonymise_drop, return_intermediate_files, embedding_super_compress, low_resource_mode_opt, save_topic_model, embeddings_state, zero_shot_similarity], outputs=[output_single_text, output_file, plot, embeddings_state, data_file_name_no_ext_state, topic_model_state, docs_state, label_list_state], api_name="topics")
-    reduce_outliers_btn.click(fn=reduce_outliers, inputs=[topic_model_state, docs_state, embeddings_state, data_file_name_no_ext_state, low_resource_mode_opt], outputs=[output_single_text, output_file, embeddings_state], api_name="reduce_outliers")
-    represent_llm_btn.click(fn=represent_topics, inputs=[topic_model_state, docs_state, embeddings_state, data_file_name_no_ext_state, low_resource_mode_opt], outputs=[output_single_text, output_file, embeddings_state], api_name="represent_llm")
-    plot_btn.click(fn=visualise_topics, inputs=[topic_model_state, docs_state, data_file_name_no_ext_state, low_resource_mode_opt, embeddings_state, label_list_state, sample_slide, visualisation_type_radio], outputs=[output_single_text, out_plot_file, plot], api_name="plot")
     #block.load(read_logs, None, logs, every=5)

 import os
+# Dendrograms will not work with the latest version of scipy (1.12.0), so installing the version prior to be safe
+os.system("pip install scipy==1.11.4")
 import gradio as gr
 from datetime import datetime
 import pandas as pd
 from sentence_transformers import SentenceTransformer
 from sklearn.feature_extraction.text import CountVectorizer
 from sklearn.pipeline import make_pipeline
 from sklearn.decomposition import TruncatedSVD
 from sklearn.feature_extraction.text import TfidfVectorizer
 from torch import cuda, backends, version
+# Default seed, can be changed in number selection on options page
 random_seed = 42
 # Check for torch cuda
+# If you want to disable cuda for testing purposes
+#os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
 print("Is CUDA enabled? ", cuda.is_available())
 print("Is a CUDA device available on this computer?", backends.cudnn.enabled)
 if cuda.is_available():
 print("Device used is: ", torch_device)
+from bertopic import BERTopic
 today = datetime.now().strftime("%d%m%Y")
 today_rev = datetime.now().strftime("%Y%m%d")
+from funcs.helper_functions import dummy_function, initial_file_load, read_file, zip_folder, delete_files_in_folder, save_topic_outputs
 #from funcs.representation_model import representation_model
 from funcs.embeddings import make_or_load_embeddings
 # Log terminal output: https://github.com/gradio-app/gradio/issues/2362
 import sys
 class Logger:
         return f.read()
 # Load embeddings
+embeddings_name = "BAAI/bge-small-en-v1.5" #"jinaai/jina-embeddings-v2-base-en"
+# Use of Jina deprecated - kept here for posterity
 # Pinning a Jina revision for security purposes: https://www.baseten.co/blog/pinning-ml-model-revisions-for-compatibility-and-security/
 # Save Jina model locally as described here: https://huggingface.co/jinaai/jina-embeddings-v2-base-en/discussions/29
 # local_embeddings_location = "model/jina/"
 #revision_choice = "b811f03af3d4d7ea72a7c25c802b21fc675a5d99"
 #revision_choice = "69d43700292701b06c24f43b96560566a4e5ad1f"
 # Model used for representing topics
+hf_model_name =  'second-state/stablelm-2-zephyr-1.6b-GGUF' #'TheBloke/phi-2-orange-GGUF' #'NousResearch/Nous-Capybara-7B-V1.9-GGUF'
+hf_model_file =   'stablelm-2-zephyr-1_6b-Q5_K_M.gguf' # 'phi-2-orange.Q5_K_M.gguf' #'Capybara-7B-V1.9-Q5_K_M.gguf'
+def extract_topics(data, in_files, min_docs_slider, in_colnames, max_topics_slider, candidate_topics, data_file_name_no_ext, custom_labels_df, anonymise_drop, return_intermediate_files, embeddings_super_compress, low_resource_mode, save_topic_model, embeddings_out, zero_shot_similarity, random_seed, calc_probs, progress=gr.Progress(track_tqdm=True)):
     progress(0, desc= "Loading data")
+    # Convert the "Yes"/"No" option string into the boolean later passed to BERTopic's calculate_probabilities argument
+    if calc_probs == "No":
+        calc_probs = False
+    elif calc_probs == "Yes":
+        print("Calculating all probabilities.")
+        # Assignment, not comparison: `calc_probs == True` left the string "Yes" in place
+        calc_probs = True
+    if not in_colnames:
+        error_message = "Please enter one column name to use to find topics."
         print(error_message)
+        return error_message, None, embeddings_out, data_file_name_no_ext, None, None
     all_tic = time.perf_counter()
     output_list = []
     file_list = [string.name for string in in_files]
     in_colnames_list_first = in_colnames[0]
+    docs = list(data[in_colnames_list_first].str.lower())
     if anonymise_drop == "Yes":
         progress(0.1, desc= "Anonymising data")
         data.to_csv(anonymise_data_name)
         output_list.append(anonymise_data_name)
+        print(anonymisation_success)
         anon_toc = time.perf_counter()
         time_out = f"Anonymising text took {anon_toc - anon_tic:0.1f} seconds"
     # Check if embeddings are being loaded in
     progress(0.2, desc= "Loading/creating embeddings")
     if low_resource_mode == "No":
         print("Using high resource BGE transformer model")
         embedding_model = SentenceTransformer(embeddings_name)
+        # Use of Jina now superseded by BGE, keeping this code just in case I consider reverting one day
         #try:
         #embedding_model = AutoModel.from_pretrained(embeddings_name, revision = revision_choice, trust_remote_code=True,device_map="auto") # For Jina
         #except:
         umap_model = TruncatedSVD(n_components=5, random_state=random_seed)
+    embeddings_out = make_or_load_embeddings(docs, file_list, embeddings_out, embedding_model, embeddings_super_compress, low_resource_mode)
     vectoriser_model = CountVectorizer(stop_words="english", ngram_range=(1, 2), min_df=0.1)
+    # Representation model not currently used in this function
+    #print("Create Keybert-like topic representations by default")
+    #from funcs.representation_model import create_representation_model, llm_config, chosen_start_tag
+    #representation_model = create_representation_model("No", llm_config, hf_model_name, hf_model_file, chosen_start_tag, low_resource_mode)
     progress(0.3, desc= "Embeddings loaded. Creating BERTopic model")
                                 umap_model=umap_model,
                                 min_topic_size = min_docs_slider,
                                 nr_topics = max_topics_slider,
+                                calculate_probabilities=calc_probs,
+                                #representation_model=representation_model,
                                 verbose = True)
+        assigned_topics, probs = topic_model.fit_transform(docs, embeddings_out)
+        #print(assigned_topics)
+        # Replace original labels with Keybert labels
+        #if "KeyBERT" in topic_model.get_topic_info().columns:
+        #    keybert_labels = [f"{i+1}: {', '.join(entry[:5])}" for i, entry in enumerate(topic_model.get_topics(full=True)["KeyBERT"].values())]
+        #    topic_model.set_topic_labels(keybert_labels)
     # Do this if you have pre-defined topics
             error_message = "Zero shot topic modelling currently not compatible with low-resource embeddings. Please change this option to 'No' on the options tab and retry."
             print(error_message)
+            return error_message, output_list, embeddings_out, data_file_name_no_ext, None, docs
         zero_shot_topics = read_file(candidate_topics.name)
         zero_shot_topics_lower = list(zero_shot_topics.iloc[:, 0].str.lower())
         topic_model = BERTopic( embedding_model=embedding_model, #embedding_model_pipe, # for Jina
                                 vectorizer_model=vectoriser_model,
                                 umap_model=umap_model,
                                 nr_topics = max_topics_slider,
                                 zeroshot_topic_list = zero_shot_topics_lower,
                                 zeroshot_min_similarity = zero_shot_similarity, # 0.7
+                                calculate_probabilities=calc_probs,
+                                #representation_model=representation_model,
                                 verbose = True)
+        assigned_topics, probs = topic_model.fit_transform(docs, embeddings_out)
+        # For some reason, zero shot topic modelling exports assigned topics as a np.array instead of a list. Converting it back here.
+        if isinstance(assigned_topics, np.ndarray):
+            assigned_topics = assigned_topics.tolist()
+        #print(assigned_topics.tolist())
+         # Zero shot modelling is a model merge, which wipes the c_tf_idf part of the resulting model completely. To get hierarchical modelling to work, we need to recreate this part of the model with the CountVectorizer options used to create the initial model. Since with zero shot we are merging two models that have exactly the same set of documents, the vocabulary should be the same, and so recreating the c_tf_idf component in this way shouldn't be a problem. Discussion here, and the code below is based on Maarten's suggested code: https://github.com/MaartenGr/BERTopic/issues/1700
+        doc_dets = topic_model.get_document_info(docs)
+        documents_per_topic = doc_dets.groupby(['Topic'], as_index=False).agg({'Document': ' '.join})
+        # Assign CountVectorizer to merged model
+        topic_model.vectorizer_model = vectoriser_model
+        # Re-calculate c-TF-IDF
+        c_tf_idf, _ = topic_model._c_tf_idf(documents_per_topic)
+        topic_model.c_tf_idf_ = c_tf_idf
+        # Replace original labels with Keybert labels
+        #if "KeyBERT" in topic_model.get_topic_info().columns:
+        #    print(topic_model.get_topics(full=True)["KeyBERT"].values())
+        #    keybert_labels = [f"{i+1}: {', '.join(entry[:5])}" for i, entry in enumerate(topic_model.get_topics(full=True)["KeyBERT"].values())]
+        #    topic_model.set_topic_labels(keybert_labels)
+    if not assigned_topics:
+    # Handle the empty array case
+        return "No topics found.", output_list, embeddings_out, data_file_name_no_ext, topic_model, docs
+    else:
+        print("Topic model created.")
+    if not custom_labels_df.empty:
+        #print(custom_labels_df.shape)
+        #topic_dets = topic_model.get_topic_info()
+        #print(topic_dets.shape)
+        topic_model.set_topic_labels(list(custom_labels_df.iloc[:,0]))
     # Outputs
     output_list, output_text = save_topic_outputs(topic_model, data_file_name_no_ext, output_list, docs, save_topic_model)
     time_out = f"All processes took {all_toc - all_tic:0.1f} seconds."
     print(time_out)
+    return output_text, output_list, embeddings_out, data_file_name_no_ext, topic_model, docs
+def reduce_outliers(topic_model, docs, embeddings_out, data_file_name_no_ext, save_topic_model, progress=gr.Progress(track_tqdm=True)):
     # Reassign outlier documents to topics, then save the refreshed model outputs.
     # NOTE(review): this listing is a diff view, so unchanged lines between hunks may be missing.
     #
     # Parameters (as used below):
     #   topic_model: object exposing fit_transform, reduce_outliers, get_topic_info and
     #                set_topic_labels (presumably a fitted BERTopic model - confirm).
     #   docs, embeddings_out: passed to fit_transform; docs is also passed to
     #                reduce_outliers and save_topic_outputs.
     #   data_file_name_no_ext, save_topic_model: forwarded unchanged to save_topic_outputs.
     #   progress: Gradio progress callback, called with a fraction and a description.
     # Returns: (output_text, output_list, topic_model).
+    progress(0, desc= "Preparing data")
     output_list = []
     all_tic = time.perf_counter()
     # NOTE(review): fit_transform is called again here to obtain the topic assignments - confirm that a re-fit is intended.
+    assigned_topics, probs = topic_model.fit_transform(docs, embeddings_out)
     # Normalise a numpy result to a plain list before it is handed to reduce_outliers.
+    if isinstance(assigned_topics, np.ndarray):
+        assigned_topics = assigned_topics.tolist()
     #progress(0.2, desc= "Loading in representation model")
     #print("Create LLM topic labels:", create_llm_topic_labels)
+    #from funcs.representation_model import create_representation_model, llm_config, chosen_start_tag
     #representation_model = create_representation_model(create_llm_topic_labels, llm_config, hf_model_name, hf_model_file, chosen_start_tag, low_resource_mode)
     # Reduce outliers if required, then update representation
     progress(0.2, desc= "Reducing outliers")
     print("Reducing outliers.")
     # Reassign outlier documents using the "embeddings" strategy requested in the call below (an earlier version of this comment described the c-TF-IDF strategy, which is not what is passed).
+    assigned_topics = topic_model.reduce_outliers(docs, assigned_topics, strategy="embeddings")
     # Then, update the topics to the ones that considered the new data
     # NOTE(review): the update_topics call below is commented out, so no update happens in the visible lines.
     print("Finished reducing outliers.")
+    progress(0.7, desc= "Replacing topic names with LLMs if necessary")
+    #print("Create LLM topic labels:", "No")
+    #vectoriser_model = CountVectorizer(stop_words="english", ngram_range=(1, 2), min_df=0.1)
+    #representation_model = create_representation_model("No", llm_config, hf_model_name, hf_model_file, chosen_start_tag, low_resource_mode)
+    #topic_model.update_topics(docs, topics=assigned_topics, vectorizer_model=vectoriser_model, representation_model=representation_model)
     topic_dets = topic_model.get_topic_info()
     # NOTE(review): the next line is indented one level deeper; its enclosing conditional is not visible in this diff view.
     # It sets the custom topic labels from the "Name" column of the topic info table.
         topic_model.set_topic_labels(list(topic_dets["Name"]))
     # Outputs
+    progress(0.9, desc= "Saving to file")
     output_list, output_text = save_topic_outputs(topic_model, data_file_name_no_ext, output_list, docs, save_topic_model)
     all_toc = time.perf_counter()
     # Elapsed wall-clock time is printed only; it is not part of the return value.
     time_out = f"All processes took {all_toc - all_tic:0.1f} seconds"
     print(time_out)
+    return output_text, output_list, topic_model
+def represent_topics(topic_model, docs, embeddings_out, data_file_name_no_ext, low_resource_mode, save_topic_model, progress=gr.Progress(track_tqdm=True)):
     # Regenerate topic representations with an LLM-backed representation model,
     # apply the resulting labels to the model and save the outputs.
     # NOTE(review): this listing is a diff view, so unchanged lines between hunks may be missing;
     # `output_list` and `all_tic` are used below but not assigned in the visible lines.
     # `hf_model_name`, `hf_model_file` and `today_rev` are read from outside this function.
     # Returns: (output_text, output_list, topic_model).
     #from funcs.prompts import capybara_prompt, capybara_start, open_hermes_prompt, open_hermes_start, stablelm_prompt, stablelm_start
     # Imported inside the function rather than at module level.
     from funcs.representation_model import create_representation_model, llm_config, chosen_start_tag
     # English stop words, unigrams and bigrams, min_df of 0.1.
     vectoriser_model = CountVectorizer(stop_words="english", ngram_range=(1, 2), min_df=0.1)
     # NOTE(review): fit_transform is called again here to obtain the topic assignments - confirm that a re-fit is intended.
+    assigned_topics, probs = topic_model.fit_transform(docs, embeddings_out)
     # Snapshot of the topic table; its "Name" column is the fallback label source below.
     topic_dets = topic_model.get_topic_info()
+    progress(0.1, desc= "Loading LLM model")
     print("Create LLM topic labels:", "Yes")
     representation_model = create_representation_model("Yes", llm_config, hf_model_name, hf_model_file, chosen_start_tag, low_resource_mode)
     # Recompute the topic representations with the vectoriser and representation model created above.
+    topic_model.update_topics(docs, topics=assigned_topics, vectorizer_model=vectoriser_model, representation_model=representation_model)
     # Replace original labels with LLM labels
     if "LLM" in topic_model.get_topic_info().columns:
         # For each topic, keep only the first line of the first "LLM" entry.
         llm_labels = [label[0][0].split("\n")[0] for label in topic_model.get_topics(full=True)["LLM"].values()]
         topic_model.set_topic_labels(llm_labels)
         # Write the labels to a one-column CSV ("Label") and add that file to the outputs.
+        label_list_file_name = data_file_name_no_ext + '_llm_topic_list_' + today_rev + '.csv'
+        llm_labels_df = pd.DataFrame(data={"Label":llm_labels})
+        llm_labels_df.to_csv(label_list_file_name, index=None)
+        #with open(label_list_file_name, 'w') as file:
+        #    file.write(f"Label\n")
+        #    for item in llm_labels:
+        #        file.write(f"{item}\n")
+        output_list.append(label_list_file_name)
     else:
         # No "LLM" column: fall back to the names from the earlier topic table.
         topic_model.set_topic_labels(list(topic_dets["Name"]))
+    # Outputs
+    progress(0.8, desc= "Saving outputs")
     output_list, output_text = save_topic_outputs(topic_model, data_file_name_no_ext, output_list, docs, save_topic_model)
     all_toc = time.perf_counter()
     # Elapsed wall-clock time is printed only; it is not part of the return value.
     time_out = f"All processes took {all_toc - all_tic:0.1f} seconds"
     print(time_out)
+    return output_text, output_list, topic_model
+def visualise_topics(topic_model, data, data_file_name_no_ext, low_resource_mode,  embeddings_out, in_label, in_colnames, sample_prop, visualisation_type_radio, random_seed, progress=gr.Progress()):
     # Build the visualisation selected in `visualisation_type_radio` and write it to HTML.
     # NOTE(review): this listing is a diff view, so unchanged lines between hunks may be missing;
     # `reduced_embeddings` is used below but not assigned in the visible lines.
     # `today_rev` is read from outside this function.
     # Returns a 4-tuple: (status text, output_list, first figure, second figure).
     # The early exits return None for both figures.
+    progress(0, desc= "Preparing data for visualisation")
     output_list = []
     vis_tic = time.perf_counter()
     # Imported inside the function rather than at module level.
+    from funcs.bertopic_vis_documents import visualize_documents_custom, visualize_hierarchical_documents_custom, visualize_barchart_custom
+    if not visualisation_type_radio:
+        return "Please choose a visualisation type above.", output_list, None, None
+    # Get topic labels
     # Only the first selected label column is used.
+    if in_label:
+       in_label_list_first = in_label[0]
+    else:
+       return "Label column not found. Please enter this above.", output_list, None, None
+    # Get docs
     # Only the first selected document column is used.
+    if in_colnames:
+        in_colnames_list_first = in_colnames[0]
+    else:
         # NOTE(review): this branch fires when no document column is selected, yet the message says "Label column" - confirm the wording.
+        return "Label column not found. Please enter this on the data load tab.", output_list, None, None
     # Documents are lower-cased before being passed to the plotting helpers.
+    docs = list(data[in_colnames_list_first].str.lower())
+    # Make sure format of input series is good
     # Missing labels become empty strings; note that this writes back into `data`.
+    data[in_label_list_first] = data[in_label_list_first].fillna('').astype(str)
+    label_list = list(data[in_label_list_first])
     topic_dets = topic_model.get_topic_info()
+    # Replace original labels with LLM labels if they exist, or go with the 'Name' column
     if "LLM" in topic_model.get_topic_info().columns:
         # For each topic, keep only the first line of the first "LLM" entry.
         llm_labels = [label[0][0].split("\n")[0] for label in topic_model.get_topics(full=True)["LLM"].values()]
         topic_model.set_topic_labels(llm_labels)
     # "Topic document graph", "Hierarchical view"
     if visualisation_type_radio == "Topic document graph":
         # Document scatter plot with `label_list` as hover text, plus a topic bar chart; both are written to HTML.
+        topics_vis = visualize_documents_custom(topic_model, docs, hover_labels = label_list, reduced_embeddings=reduced_embeddings, hide_annotations=True, hide_document_hover=False, custom_labels=True, sample = sample_prop, width= 1200, height = 750)
+        topics_vis_name = data_file_name_no_ext + '_' + 'vis_topic_docs_' + today_rev + '.html'
         topics_vis.write_html(topics_vis_name)
         output_list.append(topics_vis_name)
+        topics_vis_2 = visualize_barchart_custom(topic_model, top_n_topics = 12, custom_labels=True, width= 300, height = 250)
+        topics_vis_2_name = data_file_name_no_ext + '_' + 'vis_barchart_' + today_rev + '.html'
+        topics_vis_2.write_html(topics_vis_2_name)
+        output_list.append(topics_vis_2_name)
     elif visualisation_type_radio == "Hierarchical view":
+        # Check that original topics are retained
+        #new_topic_dets = topic_model.get_topic_info()
+        #new_topic_dets.to_csv("new_topic_dets.csv")
+        #from funcs.bertopic_hierarchical_topics_mod import hierarchical_topics_mod
         hierarchical_topics = topic_model.hierarchical_topics(docs)
+        # Save new hierarchical topic model to file
+        hierarchical_topics_name = data_file_name_no_ext + '_' + 'vis_hierarchy_topics_' + today_rev + '.csv'
+        hierarchical_topics.to_csv(hierarchical_topics_name)
+        output_list.append(hierarchical_topics_name)
+        #hierarchical_topics = hierarchical_topics_mod(topic_model, docs)
+        topics_vis = visualize_hierarchical_documents_custom(topic_model, docs, label_list, hierarchical_topics, reduced_embeddings=reduced_embeddings, sample = sample_prop, hide_document_hover= False, custom_labels=True, width= 1200, height = 750)
+        #topics_vis = topic_model.visualize_hierarchical_documents(docs, hierarchical_topics, reduced_embeddings=reduced_embeddings, sample = sample_prop, hide_document_hover= False, custom_labels=True, width= 1200, height = 750)
+        topics_vis_2 = topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics, width= 1200, height = 750)
         topics_vis_name = data_file_name_no_ext + '_' + 'vis_hierarchy_topic_doc_' + today_rev + '.html'
         topics_vis.write_html(topics_vis_name)
         # NOTE(review): `topics_vis_2_name` is not assigned in the visible lines of this branch, and `topics_vis_name` is not appended to output_list here - lines may be elided; confirm.
         topics_vis_2.write_html(topics_vis_2_name)
         output_list.append(topics_vis_2_name)
     all_toc = time.perf_counter()
     # The timing message doubles as the status text returned to the caller.
     time_out = f"Creating visualisation took {all_toc - vis_tic:0.1f} seconds"
     print(time_out)
+    return time_out, output_list, topics_vis, topics_vis_2
+def save_as_pytorch_model(topic_model, data_file_name_no_ext , progress=gr.Progress()):
     # Package the saved topic model folder as a zip archive and return its path.
     # NOTE(review): this listing is a diff view; the call that writes the model into the
     # folder is not visible here, only the zipping of that folder.
     # `today_rev` and `zip_folder` are read from outside this function.
     # Returns: (status message, list holding the zip path), or (message, None) when no model is given.
+    if not topic_model:
+        return "No Pytorch model found.", None
+    progress(0, desc= "Saving topic model in Pytorch format")
     output_list = []
     # Folder path under "output_model/"; the ".safetensors" suffix is commented out.
     topic_model_save_name_folder = "output_model/" + data_file_name_no_ext + "_topics_" + today_rev# + ".safetensors"
     topic_model_save_name_zip = topic_model_save_name_folder + ".zip"
     zip_folder(topic_model_save_name_folder, topic_model_save_name_zip)
     output_list.append(topic_model_save_name_zip)
+    return "Model saved in Pytorch format.", output_list
 # Gradio app
 # NOTE(review): this listing is a diff view; the indented lines below presumably sit inside a
 # `with block:` context that is not visible here, and the Markdown text itself is elided.
 # Create the Blocks container using the Base theme.
 block = gr.Blocks(theme = gr.themes.Base())
     # gr.State holders; several are passed as inputs/outputs to the event handlers wired up later in the file.
     topic_model_state = gr.State()
     docs_state = gr.State()
     data_file_name_no_ext_state = gr.State()
     # Initialised with an empty DataFrame rather than None.
+    label_list_state = gr.State(pd.DataFrame())
     gr.Markdown(
     """
         with gr.Accordion("Load data file", open = True):
             # NOTE(review): this listing is a diff view; some components referenced by the event
             # wiring at the bottom (e.g. topics_btn, output_file, min_docs_slider) are not defined in the visible lines.
             in_files = gr.File(label="Input text from file", file_count="multiple")
             with gr.Row():
                 # Multi-select dropdown for the text column.
+                in_colnames = gr.Dropdown(choices=["Choose a column"], multiselect = True, label="Select column to find topics (first will be chosen if multiple selected).")
         with gr.Accordion("I have my own list of topics (zero shot topic modelling).", open = False):
             candidate_topics = gr.File(label="Input topics from file (csv). File should have at least one column with a header and topic keywords in cells below. Topics will be taken from the first column of the file. Currently not compatible with low-resource embeddings.")
             with gr.Row():
                 # Post-processing actions; each button is bound to a handler in the wiring section below.
                 reduce_outliers_btn = gr.Button("Reduce outliers")
                 represent_llm_btn = gr.Button("Generate topic labels with LLMs")
+                save_pytorch_btn = gr.Button("Save model in Pytorch format")
         #logs = gr.Textbox(label="Processing logs.")
     with gr.Tab("Visualise"):
         # Controls and outputs for the plot handler (visualise_topics).
+        with gr.Row():
+            in_label = gr.Dropdown(choices=["Choose a column"], multiselect = True, label="Select column for labelling documents in output visualisations.")
+            visualisation_type_radio = gr.Radio(label="Visualisation type", choices=["Topic document graph", "Hierarchical view"])
         # Sampling proportion from 0.01 to 1, defaulting to 0.1.
+        sample_slide = gr.Slider(minimum = 0.01, maximum = 1, value = 0.1, step = 0.01, label = "Proportion of data points to show on output visualisations.")
         plot_btn = gr.Button("Visualise topic model")
+        with gr.Row():
+            vis_output_single_text = gr.Textbox(label="Visualisation output text")
+            out_plot_file = gr.File(label="Output plots to file", file_count="multiple")
         # Two plot areas, one per figure returned by the plot handler.
+        plot = gr.Plot(label="Visualise your topics here.")
+        plot_2 = gr.Plot(label="Visualise your topics here.")
     with gr.Tab("Options"):
         with gr.Accordion("Data load and processing options", open = True):
             with gr.Row():
                 anonymise_drop = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Anonymise data on file load. Names and other details are replaced with tags e.g. '<person>'.")
                 embedding_super_compress = gr.Dropdown(label = "Round embeddings to three dp for smaller files with less accuracy.", value="No", choices=["Yes", "No"])
                 # Integer seed (precision=0), default 42.
+                seed_number = gr.Number(label="Random seed to use for dimensionality reduction.", minimum=0, step=1, value=42, precision=0)
+                calc_probs = gr.Dropdown(label="Calculate all topic probabilities (i.e. a separate document prob. value for each topic)", value="No", choices=["Yes", "No"])
             with gr.Row():
                 low_resource_mode_opt = gr.Dropdown(label = "Use low resource embeddings and processing.", value="No", choices=["Yes", "No"])
+                return_intermediate_files = gr.Dropdown(label = "Return intermediate processing files from file preparation.", value="Yes", choices=["Yes", "No"])
                 save_topic_model = gr.Dropdown(label = "Save topic model to file.", value="Yes", choices=["Yes", "No"])
     # Update column names dropdown when file uploaded
     # The upload handler's outputs cover both column dropdowns and several state holders.
+    in_files.upload(fn=initial_file_load, inputs=[in_files], outputs=[in_colnames, in_label, data_state, output_single_text, topic_model_state, embeddings_state, data_file_name_no_ext_state, label_list_state])
     in_colnames.change(dummy_function, in_colnames, None)
     # Button wiring: each handler receives the state holders and option widgets listed in `inputs`
     # and writes its return values to the components listed in `outputs`.
+    topics_btn.click(fn=extract_topics, inputs=[data_state, in_files, min_docs_slider, in_colnames, max_topics_slider, candidate_topics, data_file_name_no_ext_state, label_list_state, anonymise_drop, return_intermediate_files, embedding_super_compress, low_resource_mode_opt, save_topic_model, embeddings_state, zero_shot_similarity, seed_number, calc_probs], outputs=[output_single_text, output_file, embeddings_state, data_file_name_no_ext_state, topic_model_state, docs_state], api_name="topics")
+    reduce_outliers_btn.click(fn=reduce_outliers, inputs=[topic_model_state, docs_state, embeddings_state, data_file_name_no_ext_state, save_topic_model], outputs=[output_single_text, output_file, topic_model_state], api_name="reduce_outliers")
+    represent_llm_btn.click(fn=represent_topics, inputs=[topic_model_state, docs_state, embeddings_state, data_file_name_no_ext_state, low_resource_mode_opt, save_topic_model], outputs=[output_single_text, output_file, topic_model_state], api_name="represent_llm")
+    save_pytorch_btn.click(fn=save_as_pytorch_model, inputs=[topic_model_state, data_file_name_no_ext_state], outputs=[output_single_text, output_file])
+    plot_btn.click(fn=visualise_topics, inputs=[topic_model_state, data_state, data_file_name_no_ext_state, low_resource_mode_opt, embeddings_state, in_label, in_colnames, sample_slide, visualisation_type_radio, seed_number], outputs=[vis_output_single_text, out_plot_file, plot, plot_2], api_name="plot")
     #block.load(read_logs, None, logs, every=5)

funcs/anonymiser.py CHANGED Viewed

@@ -1,7 +1,6 @@
 from spacy.cli import download
 import spacy
 spacy.prefer_gpu()
-import os
 def spacy_model_installed(model_name):
     try:

 from spacy.cli import download
 import spacy
 spacy.prefer_gpu()
 def spacy_model_installed(model_name):
     try:

funcs/bertopic_vis_documents.py CHANGED Viewed

@@ -1,10 +1,14 @@
 import numpy as np
 import pandas as pd
 import plotly.graph_objects as go
 from umap import UMAP
 from typing import List, Union
 # Shamelessly taken and adapted from Bertopic original implementation here (Maarten Grootendorst): https://github.com/MaartenGr/BERTopic/blob/master/bertopic/plotting/_documents.py
 def visualize_documents_custom(topic_model,
@@ -243,3 +247,469 @@ def visualize_documents_custom(topic_model,
     fig.update_xaxes(visible=False)
     fig.update_yaxes(visible=False)
     return fig

 import numpy as np
 import pandas as pd
 import plotly.graph_objects as go
+from plotly.subplots import make_subplots
 from umap import UMAP
 from typing import List, Union
+import itertools
+import numpy as np
 # Shamelessly taken and adapted from Bertopic original implementation here (Maarten Grootendorst): https://github.com/MaartenGr/BERTopic/blob/master/bertopic/plotting/_documents.py
 def visualize_documents_custom(topic_model,
     fig.update_xaxes(visible=False)
     fig.update_yaxes(visible=False)
     return fig
+def visualize_hierarchical_documents_custom(topic_model,
+                                     docs: List[str],
+                                     hover_labels: List[str],
+                                     hierarchical_topics: pd.DataFrame,
+                                     topics: List[int] = None,
+                                     embeddings: np.ndarray = None,
+                                     reduced_embeddings: np.ndarray = None,
+                                     sample: Union[float, int] = None,
+                                     hide_annotations: bool = False,
+                                     hide_document_hover: bool = True,
+                                     nr_levels: int = 10,
+                                     level_scale: str = 'linear',
+                                     custom_labels: Union[bool, str] = False,
+                                     title: str = "<b>Hierarchical Documents and Topics</b>",
+                                     width: int = 1200,
+                                     height: int = 750) -> go.Figure:
+    """ Visualize documents and their topics in 2D at different levels of hierarchy
+    Arguments:
+        docs: The documents you used when calling either `fit` or `fit_transform`
+        hierarchical_topics: A dataframe that contains a hierarchy of topics
+                             represented by their parents and their children
+        topics: A selection of topics to visualize.
+                Not to be confused with the topics that you get from `.fit_transform`.
+                For example, if you want to visualize only topics 1 through 5:
+                `topics = [1, 2, 3, 4, 5]`.
+        embeddings: The embeddings of all documents in `docs`.
+        reduced_embeddings: The 2D reduced embeddings of all documents in `docs`.
+        sample: The percentage of documents in each topic that you would like to keep.
+                Value can be between 0 and 1. Setting this value to, for example,
+                0.1 (10% of documents in each topic) makes it easier to visualize
+                millions of documents as a subset is chosen.
+        hide_annotations: Hide the names of the traces on top of each cluster.
+        hide_document_hover: Hide the content of the documents when hovering over
+                             specific points. Helps to speed up generation of visualizations.
+        nr_levels: The number of levels to be visualized in the hierarchy. First, the distances
+                   in `hierarchical_topics.Distance` are split in `nr_levels` lists of distances.
+                   Then, for each list of distances, the merged topics are selected that have a
+                   distance less or equal to the maximum distance of the selected list of distances.
+                   NOTE: To get all possible merged steps, make sure that `nr_levels` is equal to
+                   the length of `hierarchical_topics`.
+        level_scale: Whether to apply a linear or logarithmic (log) scale levels of the distance
+                     vector. Linear scaling will perform an equal number of merges at each level
+                     while logarithmic scaling will perform more mergers in earlier levels to
+                     provide more resolution at higher levels (this can be used for when the number
+                     of topics is large).
+        custom_labels: If bool, whether to use custom topic labels that were defined using
+                       `topic_model.set_topic_labels`.
+                       If `str`, it uses labels from other aspects, e.g., "Aspect1".
+                       NOTE: Custom labels are only generated for the original
+                       un-merged topics.
+        title: Title of the plot.
+        width: The width of the figure.
+        height: The height of the figure.
+    Examples:
+    To visualize the topics simply run:
+    ```python
+    topic_model.visualize_hierarchical_documents(docs, hierarchical_topics)
+    ```
+    Do note that this re-calculates the embeddings and reduces them to 2D.
+    The advised and prefered pipeline for using this function is as follows:
+    ```python
+    from sklearn.datasets import fetch_20newsgroups
+    from sentence_transformers import SentenceTransformer
+    from bertopic import BERTopic
+    from umap import UMAP
+    # Prepare embeddings
+    docs = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))['data']
+    sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
+    embeddings = sentence_model.encode(docs, show_progress_bar=False)
+    # Train BERTopic and extract hierarchical topics
+    topic_model = BERTopic().fit(docs, embeddings)
+    hierarchical_topics = topic_model.hierarchical_topics(docs)
+    # Reduce dimensionality of embeddings, this step is optional
+    # reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embeddings)
+    # Run the visualization with the original embeddings
+    topic_model.visualize_hierarchical_documents(docs, hierarchical_topics, embeddings=embeddings)
+    # Or, if you have reduced the original embeddings already:
+    topic_model.visualize_hierarchical_documents(docs, hierarchical_topics, reduced_embeddings=reduced_embeddings)
+    ```
+    Or if you want to save the resulting figure:
+    ```python
+    fig = topic_model.visualize_hierarchical_documents(docs, hierarchical_topics, reduced_embeddings=reduced_embeddings)
+    fig.write_html("path/to/file.html")
+    ```
+    NOTE:
+        This visualization was inspired by the scatter plot representation of Doc2Map:
+        https://github.com/louisgeisler/Doc2Map
+    <iframe src="../../getting_started/visualization/hierarchical_documents.html"
+    style="width:1000px; height: 770px; border: 0px;""></iframe>
+    """
+    topic_per_doc = topic_model.topics_
+    # Add <br> tags to hover labels to get them to appear on multiple lines
+    def wrap_by_word(s, n):
+        '''returns a string up to 300 words where \\n is inserted between every n words'''
+        a = s.split()[:300]
+        ret = ''
+        for i in range(0, len(a), n):
+            ret += ' '.join(a[i:i+n]) + '<br>'
+        return ret
+    # Apply the function to every element in the list
+    hover_labels = [wrap_by_word(s, n=20) for s in hover_labels]
+    # Sample the data to optimize for visualization and dimensionality reduction
+    if sample is None or sample > 1:
+        sample = 1
+    indices = []
+    for topic in set(topic_per_doc):
+        s = np.where(np.array(topic_per_doc) == topic)[0]
+        size = len(s) if len(s) < 100 else int(len(s)*sample)
+        indices.extend(np.random.choice(s, size=size, replace=False))
+    indices = np.array(indices)
+    df = pd.DataFrame({"topic": np.array(topic_per_doc)[indices]})
+    df["doc"] = [docs[index] for index in indices]
+    df["hover_labels"] = [hover_labels[index] for index in indices]
+    df["topic"] = [topic_per_doc[index] for index in indices]
+    # Extract embeddings if not already done
+    if sample is None:
+        if embeddings is None and reduced_embeddings is None:
+            embeddings_to_reduce = topic_model._extract_embeddings(df.doc.to_list(), method="document")
+        else:
+            embeddings_to_reduce = embeddings
+    else:
+        if embeddings is not None:
+            embeddings_to_reduce = embeddings[indices]
+        elif embeddings is None and reduced_embeddings is None:
+            embeddings_to_reduce = topic_model._extract_embeddings(df.doc.to_list(), method="document")
+    # Reduce input embeddings
+    if reduced_embeddings is None:
+        umap_model = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit(embeddings_to_reduce)
+        embeddings_2d = umap_model.embedding_
+    elif sample is not None and reduced_embeddings is not None:
+        embeddings_2d = reduced_embeddings[indices]
+    elif sample is None and reduced_embeddings is not None:
+        embeddings_2d = reduced_embeddings
+    # Combine data
+    df["x"] = embeddings_2d[:, 0]
+    df["y"] = embeddings_2d[:, 1]
+    # Create topic list for each level, levels are created by calculating the distance
+    distances = hierarchical_topics.Distance.to_list()
+    if level_scale == 'log' or level_scale == 'logarithmic':
+        log_indices = np.round(np.logspace(start=math.log(1,10), stop=math.log(len(distances)-1,10), num=nr_levels)).astype(int).tolist()
+        log_indices.reverse()
+        max_distances = [distances[i] for i in log_indices]
+    elif level_scale == 'lin' or level_scale == 'linear':
+        max_distances = [distances[indices[-1]] for indices in np.array_split(range(len(hierarchical_topics)), nr_levels)][::-1]
+    else:
+        raise ValueError("level_scale needs to be one of 'log' or 'linear'")
+    for index, max_distance in enumerate(max_distances):
+        # Get topics below `max_distance`
+        mapping = {topic: topic for topic in df.topic.unique()}
+        selection = hierarchical_topics.loc[hierarchical_topics.Distance <= max_distance, :]
+        selection.Parent_ID = selection.Parent_ID.astype(int)
+        selection = selection.sort_values("Parent_ID")
+        for row in selection.iterrows():
+            for topic in row[1].Topics:
+                mapping[topic] = row[1].Parent_ID
+        # Make sure the mappings are mapped 1:1
+        mappings = [True for _ in mapping]
+        while any(mappings):
+            for i, (key, value) in enumerate(mapping.items()):
+                if value in mapping.keys() and key != value:
+                    mapping[key] = mapping[value]
+                else:
+                    mappings[i] = False
+        # Create new column
+        df[f"level_{index+1}"] = df.topic.map(mapping)
+        df[f"level_{index+1}"] = df[f"level_{index+1}"].astype(int)
+    # Prepare topic names of original and merged topics
+    trace_names = []
+    topic_names = {}
+    for topic in range(hierarchical_topics.Parent_ID.astype(int).max()):
+        if topic < hierarchical_topics.Parent_ID.astype(int).min():
+            if topic_model.get_topic(topic):
+                if isinstance(custom_labels, str):
+                    trace_name = f"{topic}_" + "_".join(list(zip(*topic_model.topic_aspects_[custom_labels][topic]))[0][:3])
+                elif topic_model.custom_labels_ is not None and custom_labels:
+                    trace_name = topic_model.custom_labels_[topic + topic_model._outliers]
+                else:
+                    trace_name = f"{topic}_" + "_".join([word[:20] for word, _ in topic_model.get_topic(topic)][:3])
+                topic_names[topic] = {"trace_name": trace_name[:40], "plot_text": trace_name[:40]}
+                trace_names.append(trace_name)
+        else:
+            trace_name = f"{topic}_" + hierarchical_topics.loc[hierarchical_topics.Parent_ID == str(topic), "Parent_Name"].values[0]
+            plot_text = "_".join([name[:20] for name in trace_name.split("_")[:3]])
+            topic_names[topic] = {"trace_name": trace_name[:40], "plot_text": plot_text[:40]}
+            trace_names.append(trace_name)
+    # Prepare traces
+    all_traces = []
+    for level in range(len(max_distances)):
+        traces = []
+        # Outliers
+        if topic_model._outliers:
+            traces.append(
+                    go.Scattergl(
+                        x=df.loc[(df[f"level_{level+1}"] == -1), "x"],
+                        y=df.loc[df[f"level_{level+1}"] == -1, "y"],
+                        mode='markers+text',
+                        name="other",
+                        hoverinfo="text",
+                        hovertext=df.loc[(df[f"level_{level+1}"] == -1), "hover_labels"] if not hide_document_hover else None,
+                        showlegend=False,
+                        marker=dict(color='#CFD8DC', size=5, opacity=0.5),
+                        hoverlabel=dict(align='left')
+                    )
+                )
+        # Selected topics
+        if topics:
+            selection = df.loc[(df.topic.isin(topics)), :]
+            unique_topics = sorted([int(topic) for topic in selection[f"level_{level+1}"].unique()])
+        else:
+            unique_topics = sorted([int(topic) for topic in df[f"level_{level+1}"].unique()])
+        for topic in unique_topics:
+            if topic != -1:
+                if topics:
+                    selection = df.loc[(df[f"level_{level+1}"] == topic) &
+                                       (df.topic.isin(topics)), :]
+                else:
+                    selection = df.loc[df[f"level_{level+1}"] == topic, :]
+                if not hide_annotations:
+                    selection.loc[len(selection), :] = None
+                    selection["text"] = ""
+                    selection.loc[len(selection) - 1, "x"] = selection.x.mean()
+                    selection.loc[len(selection) - 1, "y"] = selection.y.mean()
+                    selection.loc[len(selection) - 1, "text"] = topic_names[int(topic)]["plot_text"]
+                traces.append(
+                    go.Scattergl(
+                        x=selection.x,
+                        y=selection.y,
+                        text=selection.text if not hide_annotations else None,
+                        hovertext=selection.hover_labels if not hide_document_hover else None,
+                        hoverinfo="text",
+                        name=topic_names[int(topic)]["trace_name"],
+                        mode='markers+text',
+                        marker=dict(size=5, opacity=0.5),
+                        hoverlabel=dict(align='left')
+                    )
+                )
+        all_traces.append(traces)
+    # Track and count traces
+    nr_traces_per_set = [len(traces) for traces in all_traces]
+    trace_indices = [(0, nr_traces_per_set[0])]
+    for index, nr_traces in enumerate(nr_traces_per_set[1:]):
+        start = trace_indices[index][1]
+        end = nr_traces + start
+        trace_indices.append((start, end))
+    # Visualization
+    fig = go.Figure()
+    for traces in all_traces:
+        for trace in traces:
+            fig.add_trace(trace)
+    for index in range(len(fig.data)):
+        if index >= nr_traces_per_set[0]:
+            fig.data[index].visible = False
+    # Create and add slider
+    steps = []
+    for index, indices in enumerate(trace_indices):
+        step = dict(
+            method="update",
+            label=str(index),
+            args=[{"visible": [False] * len(fig.data)}]
+        )
+        for index in range(indices[1]-indices[0]):
+            step["args"][0]["visible"][index+indices[0]] = True
+        steps.append(step)
+    sliders = [dict(
+        currentvalue={"prefix": "Level: "},
+        pad={"t": 20},
+        steps=steps
+    )]
+    # Add grid in a 'plus' shape
+    x_range = (df.x.min() - abs((df.x.min()) * .15), df.x.max() + abs((df.x.max()) * .15))
+    y_range = (df.y.min() - abs((df.y.min()) * .15), df.y.max() + abs((df.y.max()) * .15))
+    fig.add_shape(type="line",
+                  x0=sum(x_range) / 2, y0=y_range[0], x1=sum(x_range) / 2, y1=y_range[1],
+                  line=dict(color="#CFD8DC", width=2))
+    fig.add_shape(type="line",
+                  x0=x_range[0], y0=sum(y_range) / 2, x1=x_range[1], y1=sum(y_range) / 2,
+                  line=dict(color="#9E9E9E", width=2))
+    fig.add_annotation(x=x_range[0], y=sum(y_range) / 2, text="D1", showarrow=False, yshift=10)
+    fig.add_annotation(y=y_range[1], x=sum(x_range) / 2, text="D2", showarrow=False, xshift=10)
+    # Stylize layout
+    fig.update_layout(
+        sliders=sliders,
+        template="simple_white",
+        title={
+            'text': f"{title}",
+            'x': 0.5,
+            'xanchor': 'center',
+            'yanchor': 'top',
+            'font': dict(
+                size=22,
+                color="Black")
+        },
+        width=width,
+        height=height,
+    )
+    fig.update_xaxes(visible=False)
+    fig.update_yaxes(visible=False)
+    return fig
+def visualize_barchart_custom(topic_model,
+                       topics: List[int] = None,
+                       top_n_topics: int = 8,
+                       n_words: int = 5,
+                       custom_labels: Union[bool, str] = False,
+                       title: str = "<b>Topic Word Scores</b>",
+                       width: int = 250,
+                       height: int = 250) -> go.Figure:
+    """ Visualize a barchart of selected topics
+    Arguments:
+        topic_model: A fitted BERTopic instance.
+        topics: A selection of topics to visualize.
+        top_n_topics: Only select the top n most frequent topics.
+        n_words: Number of words to show in a topic
+        custom_labels: If bool, whether to use custom topic labels that were defined using
+                       `topic_model.set_topic_labels`.
+                       If `str`, it uses labels from other aspects, e.g., "Aspect1".
+        title: Title of the plot.
+        width: The width of each figure.
+        height: The height of each figure.
+    Returns:
+        fig: A plotly figure
+    Examples:
+    To visualize the barchart of selected topics
+    simply run:
+    ```python
+    topic_model.visualize_barchart()
+    ```
+    Or if you want to save the resulting figure:
+    ```python
+    fig = topic_model.visualize_barchart()
+    fig.write_html("path/to/file.html")
+    ```
+    <iframe src="../../getting_started/visualization/bar_chart.html"
+    style="width:1100px; height: 660px; border: 0px;""></iframe>
+    """
+    colors = itertools.cycle(["#D55E00", "#0072B2", "#CC79A7", "#E69F00", "#56B4E9", "#009E73", "#F0E442"])
+    # Select topics based on top_n and topics args
+    freq_df = topic_model.get_topic_freq()
+    freq_df = freq_df.loc[freq_df.Topic != -1, :]
+    if topics is not None:
+        topics = list(topics)
+    elif top_n_topics is not None:
+        topics = sorted(freq_df.Topic.to_list()[:top_n_topics])
+    else:
+        topics = sorted(freq_df.Topic.to_list()[0:6])
+    # Initialize figure
+    if isinstance(custom_labels, str):
+        subplot_titles = [[[str(topic), None]] + topic_model.topic_aspects_[custom_labels][topic] for topic in topics]
+        subplot_titles = ["_".join([label[0] for label in labels[:4]]) for labels in subplot_titles]
+        subplot_titles = [label if len(label) < 30 else label[:27] + "..." for label in subplot_titles]
+    elif topic_model.custom_labels_ is not None and custom_labels:
+        subplot_titles = [topic_model.custom_labels_[topic + topic_model._outliers] for topic in topics]
+    else:
+        subplot_titles = [f"Topic {topic}" for topic in topics]
+    columns = 4
+    rows = int(np.ceil(len(topics) / columns))
+    fig = make_subplots(rows=rows,
+                        cols=columns,
+                        shared_xaxes=False,
+                        horizontal_spacing=.1,
+                        vertical_spacing=.4 / rows if rows > 1 else 0,
+                        subplot_titles=subplot_titles)
+    # Add barchart for each topic
+    row = 1
+    column = 1
+    for topic in topics:
+        words = [word + "  " for word, _ in topic_model.get_topic(topic)][:n_words][::-1]
+        scores = [score for _, score in topic_model.get_topic(topic)][:n_words][::-1]
+        fig.add_trace(
+            go.Bar(x=scores,
+                   y=words,
+                   orientation='h',
+                   marker_color=next(colors)),
+            row=row, col=column)
+        if column == columns:
+            column = 1
+            row += 1
+        else:
+            column += 1
+    # Stylize graph
+    fig.update_layout(
+        template="plotly_white",
+        showlegend=False,
+        title={
+            'text': f"{title}",
+            'x': .5,
+            'xanchor': 'center',
+            'yanchor': 'top',
+            'font': dict(
+                size=16,
+                color="Black")
+        },
+        width=width*4,
+        height=height*rows if rows > 1 else height * 1.3,
+        hoverlabel=dict(
+            bgcolor="white",
+            font_size=16,
+            font_family="Rockwell"
+        ),
+    )
+    fig.update_xaxes(showgrid=True)
+    fig.update_yaxes(showgrid=True)
+    return fig

funcs/embeddings.py CHANGED Viewed

@@ -4,7 +4,6 @@ from torch import cuda
 from sklearn.pipeline import make_pipeline
 from sklearn.decomposition import TruncatedSVD
 from sklearn.feature_extraction.text import TfidfVectorizer
-from umap import UMAP
 random_seed = 42
@@ -20,13 +19,14 @@ def make_or_load_embeddings(docs, file_list, embeddings_out, embedding_model, em
         print("Embeddings not found. Loading or generating new ones.")
         embeddings_file_names = [string.lower() for string in file_list if "embedding" in string.lower()]
         if embeddings_file_names:
             print("Loading embeddings from file.")
-            embeddings_out = np.load(embeddings_file_names[0])['arr_0']
             # If embedding files have 'super_compress' in the title, they have been multiplied by 100 before save
-            if "compress" in embeddings_file_names[0]:
                 embeddings_out /= 100
         if not embeddings_file_names:
@@ -66,9 +66,9 @@ def make_or_load_embeddings(docs, file_list, embeddings_out, embedding_model, em
                 embeddings_out = np.round(embeddings_out, 3)
                 embeddings_out *= 100
-        return embeddings_out, None
     else:
         print("Found pre-loaded embeddings.")
-        return embeddings_out, None

 from sklearn.pipeline import make_pipeline
 from sklearn.decomposition import TruncatedSVD
 from sklearn.feature_extraction.text import TfidfVectorizer
 random_seed = 42
         print("Embeddings not found. Loading or generating new ones.")
         embeddings_file_names = [string.lower() for string in file_list if "embedding" in string.lower()]
         if embeddings_file_names:
+            embeddings_file_name = embeddings_file_names[0]
             print("Loading embeddings from file.")
+            embeddings_out = np.load(embeddings_file_name)['arr_0']
             # If embedding files have 'super_compress' in the title, they have been multiplied by 100 before save
+            if "compress" in embeddings_file_name:
                 embeddings_out /= 100
         if not embeddings_file_names:
                 embeddings_out = np.round(embeddings_out, 3)
                 embeddings_out *= 100
+        return embeddings_out
     else:
         print("Found pre-loaded embeddings.")
+        return embeddings_out

funcs/helper_functions.py CHANGED Viewed

@@ -6,6 +6,11 @@ import gradio as gr
 import gzip
 import pickle
 import numpy as np
 def detect_file_type(filename):
@@ -20,6 +25,8 @@ def detect_file_type(filename):
         return 'pkl.gz'
     elif filename.endswith('.pkl'):
         return 'pkl'
     else:
         raise ValueError("Unsupported file type.")
@@ -30,35 +37,45 @@ def read_file(filename):
     print("Loading in file")
     if file_type == 'csv':
-        file = pd.read_csv(filename, low_memory=False).reset_index().drop(["index", "Unnamed: 0"], axis=1, errors="ignore")
     elif file_type == 'xlsx':
-        file = pd.read_excel(filename).reset_index().drop(["index", "Unnamed: 0"], axis=1, errors="ignore")
     elif file_type == 'parquet':
-        file = pd.read_parquet(filename).reset_index().drop(["index", "Unnamed: 0"], axis=1, errors="ignore")
     elif file_type == 'pkl.gz':
         with gzip.open(filename, 'rb') as file:
             file = pickle.load(file)
             #file = pd.read_pickle(filename)
     elif file_type == 'pkl':
-        file = pickle.load(file)
     print("File load complete")
     return file
-def put_columns_in_df(in_file, in_bm25_column):
     '''
     When file is loaded, update the column dropdown choices and write to relevant data states.
     '''
     new_choices = []
     concat_choices = []
     file_list = [string.name for string in in_file]
-    data_file_names = [string.lower() for string in file_list if "npz" not in string.lower() and "pkl" not in string.lower()]
     if data_file_names:
         data_file_name = data_file_names[0]
         df = read_file(data_file_name)
         new_choices = list(df.columns)
         concat_choices.extend(new_choices)
@@ -72,13 +89,23 @@ def put_columns_in_df(in_file, in_bm25_column):
     if model_file_names:
         model_file_name = model_file_names[0]
         topic_model = read_file(model_file_name)
-        output_text = "Bertopic model loaded in"
-        return gr.Dropdown(choices=concat_choices), gr.Dropdown(choices=concat_choices), df, np.array([]), output_text, topic_model
     #The np.array([]) at the end is for clearing the embedding state when a new file is loaded
-    return gr.Dropdown(choices=concat_choices), gr.Dropdown(choices=concat_choices), df, np.array([]), output_text, None
 def get_file_path_end(file_path):
     # First, get the basename of the file (e.g., "example.txt" from "/path/to/example.txt")
@@ -134,4 +161,51 @@ def delete_files_in_folder(folder_path):
             else:
                 print(f"Skipping {file_path} as it is a directory")
         except Exception as e:
-            print(f"Failed to delete {file_path}. Reason: {e}")

 import gzip
 import pickle
 import numpy as np
+from bertopic import BERTopic
+from datetime import datetime
+today = datetime.now().strftime("%d%m%Y")
+today_rev = datetime.now().strftime("%Y%m%d")
 def detect_file_type(filename):
         return 'pkl.gz'
     elif filename.endswith('.pkl'):
         return 'pkl'
+    elif filename.endswith('.npz'):
+        return 'npz'
     else:
         raise ValueError("Unsupported file type.")
     print("Loading in file")
     if file_type == 'csv':
+        file = pd.read_csv(filename, low_memory=False)#.reset_index().drop(["index", "Unnamed: 0"], axis=1, errors="ignore")
     elif file_type == 'xlsx':
+        file = pd.read_excel(filename)#.reset_index().drop(["index", "Unnamed: 0"], axis=1, errors="ignore")
     elif file_type == 'parquet':
+        file = pd.read_parquet(filename)#.reset_index().drop(["index", "Unnamed: 0"], axis=1, errors="ignore")
     elif file_type == 'pkl.gz':
         with gzip.open(filename, 'rb') as file:
             file = pickle.load(file)
             #file = pd.read_pickle(filename)
     elif file_type == 'pkl':
+        file = BERTopic.load(filename)
+    elif file_type == 'npz':
+        file = np.load(filename)['arr_0']
+        # If embedding files have 'super_compress' in the title, they have been multiplied by 100 before save
+        if "compress" in filename:
+            file /= 100
     print("File load complete")
     return file
+def initial_file_load(in_file):
     '''
     When file is loaded, update the column dropdown choices and write to relevant data states.
     '''
     new_choices = []
     concat_choices = []
+    custom_labels = pd.DataFrame()
+    topic_model = None
+    embeddings = np.array([])
     file_list = [string.name for string in in_file]
+    data_file_names = [string.lower() for string in file_list if "npz" not in string.lower() and "pkl" not in string.lower() and "topic_list.csv" not in string.lower()]
     if data_file_names:
         data_file_name = data_file_names[0]
         df = read_file(data_file_name)
+        data_file_name_no_ext = get_file_path_end(data_file_name)
         new_choices = list(df.columns)
         concat_choices.extend(new_choices)
     if model_file_names:
         model_file_name = model_file_names[0]
         topic_model = read_file(model_file_name)
+        output_text = "Bertopic model loaded."
+    embedding_file_names = [string.lower() for string in file_list if "npz" in string.lower()]
+    if embedding_file_names:
+        embedding_file_name = embedding_file_names[0]
+        embeddings = read_file(embedding_file_name)
+        output_text = "Embeddings loaded."
+    label_file_names = [string.lower() for string in file_list if "topic_list" in string.lower()]
+    if label_file_names:
+        label_file_name = label_file_names[0]
+        custom_labels = read_file(label_file_name)
+        output_text = "Labels loaded."
     #The np.array([]) at the end is for clearing the embedding state when a new file is loaded
+    return gr.Dropdown(choices=concat_choices), gr.Dropdown(choices=concat_choices), df, output_text, topic_model, embeddings, data_file_name_no_ext, custom_labels
 def get_file_path_end(file_path):
     # First, get the basename of the file (e.g., "example.txt" from "/path/to/example.txt")
             else:
                 print(f"Skipping {file_path} as it is a directory")
         except Exception as e:
+            print(f"Failed to delete {file_path}. Reason: {e}")
+def save_topic_outputs(topic_model, data_file_name_no_ext, output_list, docs, save_topic_model, progress=gr.Progress()):
+        progress(0.7, desc= "Checking data")
+        topic_dets = topic_model.get_topic_info()
+        if topic_dets.shape[0] == 1:
+            topic_det_output_name = "topic_details_" + data_file_name_no_ext + "_" + today_rev + ".csv"
+            topic_dets.to_csv(topic_det_output_name)
+            output_list.append(topic_det_output_name)
+            return output_list, "No topics found, original file returned"
+        progress(0.8, desc= "Saving output")
+        topic_det_output_name = "topic_details_" + data_file_name_no_ext + "_" + today_rev + ".csv"
+        topic_dets.to_csv(topic_det_output_name)
+        output_list.append(topic_det_output_name)
+        doc_det_output_name = "doc_details_" + data_file_name_no_ext + "_" + today_rev + ".csv"
+        doc_dets = topic_model.get_document_info(docs)[["Document",	"Topic", "Name", "Probability", "Representative_document"]]
+        doc_dets.to_csv(doc_det_output_name)
+        output_list.append(doc_det_output_name)
+        topics_text_out_str = str(topic_dets["Name"])
+        output_text = "Topics: " + topics_text_out_str
+        # Save topic model to file
+        if save_topic_model == "Yes":
+            print("Saving BERTopic model in .pkl format.")
+            topic_model_save_name_pkl = "output_model/" + data_file_name_no_ext + "_topics_" + today_rev + ".pkl"# + ".safetensors"
+            topic_model_save_name_zip = topic_model_save_name_pkl + ".zip"
+            # Clear folder before replacing files
+            #delete_files_in_folder(topic_model_save_name_pkl)
+            topic_model.save(topic_model_save_name_pkl, serialization='pickle', save_embedding_model=False, save_ctfidf=False)
+            # Zip file example
+            #zip_folder(topic_model_save_name_pkl, topic_model_save_name_zip)
+            output_list.append(topic_model_save_name_pkl)
+        return output_list, output_text

funcs/representation_model.py CHANGED Viewed

@@ -28,7 +28,7 @@ else:
     low_resource_mode = "Yes"
     n_gpu_layers = 0
-low_resource_mode = "No" # Override for testing
 #print("Running on device:", torch_device)
 n_threads = torch.get_num_threads()

     low_resource_mode = "Yes"
     n_gpu_layers = 0
+#low_resource_mode = "No" # Override for testing
 #print("Running on device:", torch_device)
 n_threads = torch.get_num_threads()

requirements.txt CHANGED Viewed

@@ -1,11 +1,12 @@
 gradio==3.50.0
-transformers
-accelerate
-torch
-llama-cpp-python
-bertopic
-spacy
-pyarrow
-faker
-presidio_analyzer
-presidio_anonymizer

 gradio==3.50.0
+transformers==4.37.1
+accelerate==0.26.1
+torch==2.1.2
+llama-cpp-python==0.2.33
+bertopic==0.16.0
+spacy==3.7.2
+pyarrow==14.0.2
+Faker==22.2.0
+presidio_analyzer==2.2.351
+presidio_anonymizer==2.2.351
+scipy==1.11.4