Spaces:

seanpedrickcase
/

topic_modelling

Running

App Files Files Community

Sonnyjim committed on Jan 25, 2024

Commit

ffe5eb2

1 Parent(s): 9eeba1e

More efficient saving of embeddings and loading/processing of representations. Added a custom hover option for the visualisation, plus formatting improvements. Version 0.1?

Browse files

Files changed (4) hide show

.gitignore +1 -0
app.py +78 -50
funcs/bertopic_vis_documents.py +245 -0
funcs/representation_model.py +1 -1

.gitignore CHANGED Viewed

@@ -7,6 +7,7 @@
 *.png
 *.safetensors
 *.json
 .ipynb_checkpoints/*
 old_code/*
 model/*

 *.png
 *.safetensors
 *.json
+*.html
 .ipynb_checkpoints/*
 old_code/*
 model/*

app.py CHANGED Viewed

@@ -80,7 +80,14 @@ hf_model_name =  'TheBloke/phi-2-orange-GGUF' #'NousResearch/Nous-Capybara-7B-V1
 hf_model_file =   'phi-2-orange.Q5_K_M.gguf' #'Capybara-7B-V1.9-Q5_K_M.gguf' # 'stablelm-2-zephyr-1_6b-Q5_K_M.gguf'
-def extract_topics(in_files, in_file, min_docs_slider, in_colnames, max_topics_slider, candidate_topics, in_label, anonymise_drop, return_intermediate_files, embeddings_super_compress, low_resource_mode, create_llm_topic_labels, save_topic_model, visualise_topics, reduce_outliers, embeddings_out):
     all_tic = time.perf_counter()
@@ -97,8 +104,13 @@ def extract_topics(in_files, in_file, min_docs_slider, in_colnames, max_topics_s
         in_label_list_first = in_label[0]
     else:
         in_label_list_first = in_colnames_list_first
     if anonymise_drop == "Yes":
         anon_tic = time.perf_counter()
         time_out = f"Creating visualisation took {all_toc - vis_tic:0.1f} seconds"
         in_files_anon_col, anonymisation_success = anon.anonymise_script(in_files, in_colnames_list_first, anon_strat="replace")
@@ -111,7 +123,7 @@ def extract_topics(in_files, in_file, min_docs_slider, in_colnames, max_topics_s
         time_out = f"Anonymising text took {anon_toc - anon_tic:0.1f} seconds"
     docs = list(in_files[in_colnames_list_first].str.lower())
-    label_col = in_files[in_label_list_first]
     # Check if embeddings are being loaded in
     ## Load in pre-embedded file if exists
@@ -144,6 +156,8 @@ def extract_topics(in_files, in_file, min_docs_slider, in_colnames, max_topics_s
         umap_model = TruncatedSVD(n_components=5, random_state=random_seed)
     embeddings_out, reduced_embeddings = make_or_load_embeddings(docs, file_list, data_file_name_no_ext, embeddings_out, embedding_model, return_intermediate_files, embeddings_super_compress, low_resource_mode, create_llm_topic_labels)
     vectoriser_model = CountVectorizer(stop_words="english", ngram_range=(1, 2), min_df=0.1)
@@ -151,28 +165,27 @@ def extract_topics(in_files, in_file, min_docs_slider, in_colnames, max_topics_s
     from funcs.prompts import capybara_prompt, capybara_start, open_hermes_prompt, open_hermes_start, stablelm_prompt, stablelm_start
     from funcs.representation_model import create_representation_model, llm_config, chosen_start_tag
-    print("Create LLM topic labels:", create_llm_topic_labels)
-    representation_model = create_representation_model(create_llm_topic_labels, llm_config, hf_model_name, hf_model_file, chosen_start_tag, low_resource_mode)
     if not candidate_topics:
         # Generate representation model here if topics won't be changed later
-        if reduce_outliers == "No":
-            topic_model = BERTopic( embedding_model=embedding_model_pipe,
-                                    vectorizer_model=vectoriser_model,
-                                    umap_model=umap_model,
-                                    min_topic_size = min_docs_slider,
-                                    nr_topics = max_topics_slider,
-                                    representation_model=representation_model,
-                                    verbose = True)
-        else:
-            topic_model = BERTopic( embedding_model=embedding_model_pipe,
-                                    vectorizer_model=vectoriser_model,
-                                    umap_model=umap_model,
-                                    min_topic_size = min_docs_slider,
-                                    nr_topics = max_topics_slider,
-                                    verbose = True)
         topics_text, probs = topic_model.fit_transform(docs, embeddings_out)
@@ -189,25 +202,25 @@ def extract_topics(in_files, in_file, min_docs_slider, in_colnames, max_topics_s
         zero_shot_topics_lower = list(zero_shot_topics.iloc[:, 0].str.lower())
         # Generate representation model here if topics won't be changed later
-        if reduce_outliers == "No":
-            topic_model = BERTopic( embedding_model=embedding_model_pipe,
-                                    vectorizer_model=vectoriser_model,
-                                    umap_model=umap_model,
-                                    min_topic_size = min_docs_slider,
-                                    nr_topics = max_topics_slider,
-                                    zeroshot_topic_list = zero_shot_topics_lower,
-                                    zeroshot_min_similarity = 0.5,#0.7,
-                                    representation_model=representation_model,
-                                    verbose = True)
-        else:
-            topic_model = BERTopic( embedding_model=embedding_model_pipe,
-                                    vectorizer_model=vectoriser_model,
-                                    umap_model=umap_model,
-                                    min_topic_size = min_docs_slider,
-                                    nr_topics = max_topics_slider,
-                                    zeroshot_topic_list = zero_shot_topics_lower,
-                                    zeroshot_min_similarity = 0.5,#0.7,
-                                    verbose = True)
         topics_text, probs = topic_model.fit_transform(docs, embeddings_out)
@@ -215,35 +228,43 @@ def extract_topics(in_files, in_file, min_docs_slider, in_colnames, max_topics_s
         return "No topics found.", data_file_name, None
     else:
-        print("Preparing topic model outputs.")
-    # Reduce outliers if required
     if reduce_outliers == "Yes":
         print("Reducing outliers.")
         # Calculate the c-TF-IDF representation for each outlier document and find the best matching c-TF-IDF topic representation using cosine similarity.
         topics_text = topic_model.reduce_outliers(docs, topics_text, strategy="embeddings")
         # Then, update the topics to the ones that considered the new data
-        topic_model.update_topics(docs, topics=topics_text, vectorizer_model=vectoriser_model, representation_model=representation_model)
         print("Finished reducing outliers.")
     topic_dets = topic_model.get_topic_info()
-    #print(topic_dets.columns)
     if topic_dets.shape[0] == 1:
         topic_det_output_name = "topic_details_" + data_file_name_no_ext + "_" + today_rev + ".csv"
         topic_dets.to_csv(topic_det_output_name)
         output_list.append(topic_det_output_name)
-        return "No topics found, original file returned", output_list, None
     # Replace original labels with LLM labels
-    if "Mistral" in topic_model.get_topic_info().columns:
-        llm_labels = [label[0][0].split("\n")[0] for label in topic_model.get_topics(full=True)["Mistral"].values()]
         topic_model.set_topic_labels(llm_labels)
     else:
         topic_model.set_topic_labels(list(topic_dets["Name"]))
     # Outputs
     topic_det_output_name = "topic_details_" + data_file_name_no_ext + "_" + today_rev + ".csv"
     topic_dets.to_csv(topic_det_output_name)
@@ -288,10 +309,15 @@ def extract_topics(in_files, in_file, min_docs_slider, in_colnames, max_topics_s
         output_list.append(embeddings_file_name)
     if visualise_topics == "Yes":
         # Visualise the topics:
         vis_tic = time.perf_counter()
         print("Creating visualisation")
-        topics_vis = topic_model.visualize_documents(label_col, reduced_embeddings=reduced_embeddings, hide_annotations=True, hide_document_hover=False, custom_labels=True)
         all_toc = time.perf_counter()
         time_out = f"Creating visualisation took {all_toc - vis_tic:0.1f} seconds"
@@ -304,7 +330,7 @@ def extract_topics(in_files, in_file, min_docs_slider, in_colnames, max_topics_s
         return output_text, output_list, topics_vis, embeddings_out
     all_toc = time.perf_counter()
-    time_out = f"All processes took {all_toc - all_tic:0.1f} seconds"
     print(time_out)
     return output_text, output_list, None, embeddings_out
@@ -321,7 +347,9 @@ with block:
     gr.Markdown(
     """
     # Topic modeller
-    Generate topics from open text in tabular data. Upload a file (csv, xlsx, or parquet), then specify the columns that you want to use to generate topics and use for labels in the visualisation. If you have an embeddings .npz file of the text made using the 'jina-embeddings-v2-small-en' model, you can load this in at the same time to skip the first modelling step. If you have a pre-defined list of topics, you can upload this as a csv file under 'I have my own list of topics...'. Further configuration options are available under the 'Options' tab.
     """)
     with gr.Tab("Load files and find topics"):
@@ -329,7 +357,7 @@ with block:
             in_files = gr.File(label="Input text from file", file_count="multiple")
             with gr.Row():
                 in_colnames = gr.Dropdown(choices=["Choose a column"], multiselect = True, label="Select column to find topics (first will be chosen if multiple selected).")
-                in_label = gr.Dropdown(choices=["Choose a column"], multiselect = True, label="Select column to for labelling documents in the output visualisation.")
         with gr.Accordion("I have my own list of topics (zero shot topic modelling).", open = False):
             candidate_topics = gr.File(label="Input topics from file (csv). File should have at least one column with a header and topic keywords in cells below. Topics will be taken from the first column of the file. Currently not compatible with low-resource embeddings.")

 hf_model_file =   'phi-2-orange.Q5_K_M.gguf' #'Capybara-7B-V1.9-Q5_K_M.gguf' # 'stablelm-2-zephyr-1_6b-Q5_K_M.gguf'
+def extract_topics(in_files, in_file, min_docs_slider, in_colnames, max_topics_slider, candidate_topics, in_label, anonymise_drop, return_intermediate_files, embeddings_super_compress, low_resource_mode, create_llm_topic_labels, save_topic_model, visualise_topics, reduce_outliers, embeddings_out, progress=gr.Progress()):
+    progress(0, desc= "Loading data")
+    if not in_colnames or not in_label:
+        error_message = "Please enter one column name for the topics and another for the labelling."
+        print(error_message)
+        return error_message, None, None, embeddings_out
     all_tic = time.perf_counter()
         in_label_list_first = in_label[0]
     else:
         in_label_list_first = in_colnames_list_first
+    # Make sure format of input series is good
+    in_files[in_colnames_list_first] = in_files[in_colnames_list_first].fillna('').astype(str)
+    in_files[in_label_list_first] = in_files[in_label_list_first].fillna('').astype(str)
     if anonymise_drop == "Yes":
+        progress(0.1, desc= "Anonymising data")
         anon_tic = time.perf_counter()
         time_out = f"Creating visualisation took {all_toc - vis_tic:0.1f} seconds"
         in_files_anon_col, anonymisation_success = anon.anonymise_script(in_files, in_colnames_list_first, anon_strat="replace")
         time_out = f"Anonymising text took {anon_toc - anon_tic:0.1f} seconds"
     docs = list(in_files[in_colnames_list_first].str.lower())
+    label_list = list(in_files[in_label_list_first])
     # Check if embeddings are being loaded in
     ## Load in pre-embedded file if exists
         umap_model = TruncatedSVD(n_components=5, random_state=random_seed)
+    progress(0.2, desc= "Loading/creating embeddings")
     embeddings_out, reduced_embeddings = make_or_load_embeddings(docs, file_list, data_file_name_no_ext, embeddings_out, embedding_model, return_intermediate_files, embeddings_super_compress, low_resource_mode, create_llm_topic_labels)
     vectoriser_model = CountVectorizer(stop_words="english", ngram_range=(1, 2), min_df=0.1)
     from funcs.prompts import capybara_prompt, capybara_start, open_hermes_prompt, open_hermes_start, stablelm_prompt, stablelm_start
     from funcs.representation_model import create_representation_model, llm_config, chosen_start_tag
+    progress(0.3, desc= "Embeddings loaded. Creating BERTopic model")
     if not candidate_topics:
         # Generate representation model here if topics won't be changed later
+        # if reduce_outliers == "No":
+        #     topic_model = BERTopic( embedding_model=embedding_model_pipe,
+        #                             vectorizer_model=vectoriser_model,
+        #                             umap_model=umap_model,
+        #                             min_topic_size = min_docs_slider,
+        #                             nr_topics = max_topics_slider,
+        #                             representation_model=representation_model,
+        #                             verbose = True)
+        topic_model = BERTopic( embedding_model=embedding_model_pipe,
+                                vectorizer_model=vectoriser_model,
+                                umap_model=umap_model,
+                                min_topic_size = min_docs_slider,
+                                nr_topics = max_topics_slider,
+                                verbose = True)
         topics_text, probs = topic_model.fit_transform(docs, embeddings_out)
         zero_shot_topics_lower = list(zero_shot_topics.iloc[:, 0].str.lower())
         # Generate representation model here if topics won't be changed later
+        # if reduce_outliers == "No":
+        #     topic_model = BERTopic( embedding_model=embedding_model_pipe,
+        #                             vectorizer_model=vectoriser_model,
+        #                             umap_model=umap_model,
+        #                             min_topic_size = min_docs_slider,
+        #                             nr_topics = max_topics_slider,
+        #                             zeroshot_topic_list = zero_shot_topics_lower,
+        #                             zeroshot_min_similarity = 0.5,#0.7,
+        #                             representation_model=representation_model,
+        #                             verbose = True)
+        # else:
+        topic_model = BERTopic( embedding_model=embedding_model_pipe,
+                                vectorizer_model=vectoriser_model,
+                                umap_model=umap_model,
+                                min_topic_size = min_docs_slider,
+                                nr_topics = max_topics_slider,
+                                zeroshot_topic_list = zero_shot_topics_lower,
+                                zeroshot_min_similarity = 0.5,#0.7,
+                                verbose = True)
         topics_text, probs = topic_model.fit_transform(docs, embeddings_out)
         return "No topics found.", data_file_name, None
     else:
+        print("Topic model created.")
+    progress(0.5, desc= "Loading in representation model")
+    print("Create LLM topic labels:", create_llm_topic_labels)
+    representation_model = create_representation_model(create_llm_topic_labels, llm_config, hf_model_name, hf_model_file, chosen_start_tag, low_resource_mode)
+    # Reduce outliers if required, then update representation
     if reduce_outliers == "Yes":
+        progress(0.6, desc= "Reducing outliers then creating topic representations")
         print("Reducing outliers.")
         # Calculate the c-TF-IDF representation for each outlier document and find the best matching c-TF-IDF topic representation using cosine similarity.
         topics_text = topic_model.reduce_outliers(docs, topics_text, strategy="embeddings")
         # Then, update the topics to the ones that considered the new data
         print("Finished reducing outliers.")
+    progress(0.6, desc= "Creating topic representations")
+    topic_model.update_topics(docs, topics=topics_text, vectorizer_model=vectoriser_model, representation_model=representation_model)
     topic_dets = topic_model.get_topic_info()
     if topic_dets.shape[0] == 1:
         topic_det_output_name = "topic_details_" + data_file_name_no_ext + "_" + today_rev + ".csv"
         topic_dets.to_csv(topic_det_output_name)
         output_list.append(topic_det_output_name)
+        return "No topics found, original file returned", output_list, None, embeddings_out
     # Replace original labels with LLM labels
+    if "Phi" in topic_model.get_topic_info().columns:
+        llm_labels = [label[0][0].split("\n")[0] for label in topic_model.get_topics(full=True)["Phi"].values()]
         topic_model.set_topic_labels(llm_labels)
     else:
         topic_model.set_topic_labels(list(topic_dets["Name"]))
     # Outputs
+    progress(0.8, desc= "Saving output")
     topic_det_output_name = "topic_details_" + data_file_name_no_ext + "_" + today_rev + ".csv"
     topic_dets.to_csv(topic_det_output_name)
         output_list.append(embeddings_file_name)
     if visualise_topics == "Yes":
+        from funcs.bertopic_vis_documents import visualize_documents_custom
+        progress(0.9, desc= "Creating visualisation (this can take a while)")
         # Visualise the topics:
         vis_tic = time.perf_counter()
         print("Creating visualisation")
+        topics_vis = visualize_documents_custom(topic_model, docs, hover_labels = label_list, reduced_embeddings=reduced_embeddings, hide_annotations=True, hide_document_hover=False, custom_labels=True)
+        topics_vis_name = data_file_name_no_ext + '_' + 'visualisation_' + today_rev + '.html'
+        topics_vis.write_html(topics_vis_name)
+        output_list.append(topics_vis_name)
         all_toc = time.perf_counter()
         time_out = f"Creating visualisation took {all_toc - vis_tic:0.1f} seconds"
         return output_text, output_list, topics_vis, embeddings_out
     all_toc = time.perf_counter()
+    time_out = f"All processes took {all_toc - all_tic:0.1f} seconds."
     print(time_out)
     return output_text, output_list, None, embeddings_out
     gr.Markdown(
     """
     # Topic modeller
+    Generate topics from open text in tabular data. Upload a file (csv, xlsx, or parquet), then specify the open text column that you want to use to generate topics, and another for labels in the visualisation. If you have an embeddings .npz file of the text made using the 'jina-embeddings-v2-small-en' model, you can load this in at the same time to skip the first modelling step. If you have a pre-defined list of topics, you can upload this as a csv file under 'I have my own list of topics...'. Further configuration options are available under the 'Options' tab.
+    Suggested test dataset: https://huggingface.co/datasets/rag-datasets/mini_wikipedia/tree/main/data (passages.parquet)
     """)
     with gr.Tab("Load files and find topics"):
             in_files = gr.File(label="Input text from file", file_count="multiple")
             with gr.Row():
                 in_colnames = gr.Dropdown(choices=["Choose a column"], multiselect = True, label="Select column to find topics (first will be chosen if multiple selected).")
+                in_label = gr.Dropdown(choices=["Choose a column"], multiselect = True, label="Select column for labelling documents in the output visualisation.")
         with gr.Accordion("I have my own list of topics (zero shot topic modelling).", open = False):
             candidate_topics = gr.File(label="Input topics from file (csv). File should have at least one column with a header and topic keywords in cells below. Topics will be taken from the first column of the file. Currently not compatible with low-resource embeddings.")

funcs/bertopic_vis_documents.py ADDED Viewed

	@@ -0,0 +1,245 @@

+import numpy as np
+import pandas as pd
+import plotly.graph_objects as go
+from umap import UMAP
+from typing import List, Union
+# Shamelessly taken and adapted from the original BERTopic implementation (Maarten Grootendorst): https://github.com/MaartenGr/BERTopic/blob/master/bertopic/plotting/_documents.py
+def visualize_documents_custom(topic_model,
+                        docs: List[str],
+                        hover_labels: List[str],
+                        topics: List[int] = None,
+                        embeddings: np.ndarray = None,
+                        reduced_embeddings: np.ndarray = None,
+                        sample: float = None,
+                        hide_annotations: bool = False,
+                        hide_document_hover: bool = False,
+                        custom_labels: Union[bool, str] = False,
+                        title: str = "<b>Documents and Topics</b>",
+                        width: int = 1200,
+                        height: int = 750):
+    """ Visualize documents and their topics in 2D, with custom hover text per document
+    Arguments:
+        topic_model: A fitted BERTopic instance.
+        docs: The documents you used when calling either `fit` or `fit_transform`
+        hover_labels: One label per entry in `docs`, in the same order. The label
+                      (rather than the document text) is shown when hovering over a
+                      point, with a `<br>` inserted after every 20 words so that long
+                      labels wrap over several lines.
+        topics: A selection of topics to visualize.
+                Not to be confused with the topics that you get from `.fit_transform`.
+                For example, if you want to visualize only topics 1 through 5:
+                `topics = [1, 2, 3, 4, 5]`.
+        embeddings: The embeddings of all documents in `docs`. Only used (reduced to 2D
+                    with UMAP) when `reduced_embeddings` is not given.
+        reduced_embeddings: The 2D reduced embeddings of all documents in `docs`.
+        sample: The percentage of documents in each topic that you would like to keep.
+                Value can be between 0 and 1. Setting this value to, for example,
+                0.1 (10% of documents in each topic) makes it easier to visualize
+                millions of documents as a subset is chosen. `None` or a value above 1
+                keeps everything; topics with fewer than 100 documents are always kept whole.
+        hide_annotations: Hide the names of the traces on top of each cluster.
+        hide_document_hover: Hide the hover labels when hovering over
+                             specific points. Helps to speed up generation of visualization.
+        custom_labels: If bool, whether to use custom topic labels that were defined using
+                       `topic_model.set_topic_labels`.
+                       If `str`, it uses labels from other aspects, e.g., "Aspect1".
+        title: Title of the plot.
+        width: The width of the figure.
+        height: The height of the figure.
+    Returns:
+        fig: A plotly figure with one grey trace for outliers / non-selected topics and
+             one trace per selected topic.
+    Examples:
+    To visualize the topics simply run:
+    ```python
+    fig = visualize_documents_custom(topic_model, docs, hover_labels=labels)
+    ```
+    Do note that this re-calculates the embeddings and reduces them to 2D.
+    The advised and preferred pipeline is to pass embeddings you already have:
+    ```python
+    # Run the visualization with the original embeddings
+    fig = visualize_documents_custom(topic_model, docs, hover_labels=labels, embeddings=embeddings)
+    # Or, if you have reduced the original embeddings to 2D already:
+    fig = visualize_documents_custom(topic_model, docs, hover_labels=labels, reduced_embeddings=reduced_embeddings)
+    ```
+    Or if you want to save the resulting figure:
+    ```python
+    fig.write_html("path/to/file.html")
+    ```
+    """
+    topic_per_doc = topic_model.topics_
+    # Built once so the per-topic lookups below do not rebuild the array every iteration
+    topic_array = np.array(topic_per_doc)
+    # Add <br> tags to hover labels to get them to appear on multiple lines
+    def wrap_by_word(s, n):
+        '''returns a string where <br> is appended after every n words'''
+        words = str(s).split()
+        return ''.join(' '.join(words[i:i+n]) + '<br>' for i in range(0, len(words), n))
+    def prepare_selection(frame, annotation=None):
+        '''returns an independent copy of frame with an empty text column and, if annotation is given, an extra centroid row carrying it'''
+        # reset_index gives a fresh 0..n-1 index on a new object: appending at label
+        # len(frame) can then never overwrite an existing document row, and writing to
+        # the copy does not trigger chained-assignment warnings on the parent frame.
+        frame = frame.reset_index(drop=True)
+        frame["text"] = ""
+        if annotation is not None:
+            centroid = {"x": frame.x.mean(), "y": frame.y.mean(), "text": annotation}
+            # Build the row from the actual column list so it always has the right length
+            frame.loc[len(frame), :] = [centroid.get(column) for column in frame.columns]
+        return frame
+    # Apply the function to every element in the list
+    hover_labels = [wrap_by_word(s, n=20) for s in hover_labels]
+    # Sample the data to optimize for visualization and dimensionality reduction
+    if sample is None or sample > 1:
+        sample = 1
+    indices = []
+    for topic in set(topic_per_doc):
+        s = np.where(topic_array == topic)[0]
+        size = len(s) if len(s) < 100 else int(len(s) * sample)
+        indices.extend(np.random.choice(s, size=size, replace=False))
+    indices = np.array(indices, dtype=int)
+    df = pd.DataFrame({"topic": topic_array[indices]})
+    df["doc"] = [docs[index] for index in indices]
+    df["hover_labels"] = [hover_labels[index] for index in indices]
+    # `indices` is a (possibly sub-sampled) reordering of the documents, so any
+    # embeddings passed in must be re-indexed to stay aligned with the rows of df
+    if reduced_embeddings is not None:
+        embeddings_2d = reduced_embeddings[indices]
+    else:
+        # Extract embeddings if not already done
+        if embeddings is not None:
+            embeddings_to_reduce = embeddings[indices]
+        else:
+            embeddings_to_reduce = topic_model._extract_embeddings(df.doc.to_list(), method="document")
+        # Reduce input embeddings
+        umap_model = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit(embeddings_to_reduce)
+        embeddings_2d = umap_model.embedding_
+    unique_topics = set(topic_per_doc)
+    if topics is None:
+        topics = unique_topics
+    # Combine data
+    df["x"] = embeddings_2d[:, 0]
+    df["y"] = embeddings_2d[:, 1]
+    # Prepare text and names
+    if isinstance(custom_labels, str):
+        names = [[[str(topic), None]] + topic_model.topic_aspects_[custom_labels][topic] for topic in unique_topics]
+        names = ["_".join([label[0] for label in labels[:4]]) for labels in names]
+        names = [label if len(label) < 30 else label[:27] + "..." for label in names]
+    elif topic_model.custom_labels_ is not None and custom_labels:
+        names = [topic_model.custom_labels_[topic + topic_model._outliers] for topic in unique_topics]
+    else:
+        names = [f"{topic}_" + "_".join([word for word, value in topic_model.get_topic(topic)][:3]) for topic in unique_topics]
+    # Visualize
+    fig = go.Figure()
+    # Outliers and non-selected topics
+    non_selected_topics = set(unique_topics).difference(topics)
+    if len(non_selected_topics) == 0:
+        non_selected_topics = [-1]
+    selection = prepare_selection(df.loc[df.topic.isin(non_selected_topics), :], annotation="Other documents")
+    fig.add_trace(
+        go.Scattergl(
+            x=selection.x,
+            y=selection.y,
+            hovertext=selection.hover_labels if not hide_document_hover else None,
+            hoverinfo="text",
+            mode='markers+text',
+            name="other",
+            showlegend=False,
+            marker=dict(color='#CFD8DC', size=5, opacity=0.5),
+            hoverlabel=dict(align='left')
+        )
+    )
+    # Selected topics
+    for name, topic in zip(names, unique_topics):
+        if topic in topics and topic != -1:
+            # The centroid row carries the topic name that is drawn on top of the cluster
+            selection = prepare_selection(df.loc[df.topic == topic, :], annotation=None if hide_annotations else name)
+            fig.add_trace(
+                go.Scattergl(
+                    x=selection.x,
+                    y=selection.y,
+                    hovertext=selection.hover_labels if not hide_document_hover else None,
+                    hoverinfo="text",
+                    text=selection.text,
+                    mode='markers+text',
+                    name=name,
+                    textfont=dict(
+                        size=12,
+                    ),
+                    marker=dict(size=5, opacity=0.5),
+                    hoverlabel=dict(align='left')
+            ))
+    # Add grid in a 'plus' shape
+    x_range = (df.x.min() - abs((df.x.min()) * .15), df.x.max() + abs((df.x.max()) * .15))
+    y_range = (df.y.min() - abs((df.y.min()) * .15), df.y.max() + abs((df.y.max()) * .15))
+    fig.add_shape(type="line",
+                  x0=sum(x_range) / 2, y0=y_range[0], x1=sum(x_range) / 2, y1=y_range[1],
+                  line=dict(color="#CFD8DC", width=2))
+    fig.add_shape(type="line",
+                  x0=x_range[0], y0=sum(y_range) / 2, x1=x_range[1], y1=sum(y_range) / 2,
+                  line=dict(color="#9E9E9E", width=2))
+    fig.add_annotation(x=x_range[0], y=sum(y_range) / 2, text="D1", showarrow=False, yshift=10)
+    fig.add_annotation(y=y_range[1], x=sum(x_range) / 2, text="D2", showarrow=False, xshift=10)
+    # Stylize layout
+    fig.update_layout(
+        template="simple_white",
+        title={
+            'text': f"{title}",
+            'x': 0.5,
+            'xanchor': 'center',
+            'yanchor': 'top',
+            'font': dict(
+                size=22,
+                color="Black")
+        },
+        hoverlabel_align = 'left',
+        width=width,
+        height=height
+    )
+    fig.update_xaxes(visible=False)
+    fig.update_yaxes(visible=False)
+    return fig

funcs/representation_model.py CHANGED Viewed

@@ -168,7 +168,7 @@ def create_representation_model(create_llm_topic_labels, llm_config, hf_model_na
         # All representation models
         representation_model = {
         "KeyBERT": keybert,
-        "Mistral": llm_model
         }
     elif create_llm_topic_labels == "No":

         # All representation models
         representation_model = {
         "KeyBERT": keybert,
+        "Phi": llm_model
         }
     elif create_llm_topic_labels == "No":