Sonnyjim committed on
Commit 9dbf344 (0 parents)

first commit

.github/workflows/check_file_size.yml ADDED
@@ -0,0 +1,16 @@
+ name: Check file size
+ on: # or directly `on: [push]` to run the action on every push on any branch
+   pull_request:
+     branches: [main]
+
+   # to run this workflow manually from the Actions tab
+   workflow_dispatch:
+
+ jobs:
+   sync-to-hub:
+     runs-on: ubuntu-latest
+     steps:
+       - name: Check large files
+         uses: ActionsDesk/lfs-warning@v2.0
+         with:
+           filesizelimit: 10485760 # this is 10MB so we can sync to HF Spaces
.github/workflows/sync_to_hf.yml ADDED
@@ -0,0 +1,20 @@
+ name: Sync to Hugging Face hub
+ on:
+   push:
+     branches: [main]
+
+   # to run this workflow manually from the Actions tab
+   workflow_dispatch:
+
+ jobs:
+   sync-to-hub:
+     runs-on: ubuntu-latest
+     steps:
+       - uses: actions/checkout@v3
+         with:
+           fetch-depth: 0
+           lfs: true
+       - name: Push to hub
+         env:
+           HF_TOKEN: ${{ secrets.HF_TOKEN }}
+         run: git push https://seanpedrickcase:$HF_TOKEN@huggingface.co/spaces/seanpedrickcase/topic_modelling main
.gitignore ADDED
@@ -0,0 +1,8 @@
+ *.pyc
+ *.ipynb
+ *.npz
+ *.csv
+ *.pkl
+ .ipynb_checkpoints/*
+ old_code/*
+ model/*
Dockerfile ADDED
@@ -0,0 +1,30 @@
+ FROM python:3.10
+
+ WORKDIR /src
+
+ COPY requirements.txt .
+
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ # Set up a new user named "user" with user ID 1000
+ RUN useradd -m -u 1000 user
+ # Switch to the "user" user
+ USER user
+ # Set home to the user's home directory
+ ENV HOME=/home/user \
+ 	PATH=/home/user/.local/bin:$PATH \
+ 	PYTHONPATH=$HOME/app \
+ 	PYTHONUNBUFFERED=1 \
+ 	GRADIO_ALLOW_FLAGGING=never \
+ 	GRADIO_NUM_PORTS=1 \
+ 	GRADIO_SERVER_NAME=0.0.0.0 \
+ 	GRADIO_THEME=huggingface \
+ 	SYSTEM=spaces
+
+ # Set the working directory to the user's home directory
+ WORKDIR $HOME/app
+
+ # Copy the current directory contents into the container at $HOME/app, setting the owner to the user
+ COPY --chown=user . $HOME/app
+
+ CMD ["python", "app.py"]
LICENSE ADDED
@@ -0,0 +1,201 @@
+                                  Apache License
+                            Version 2.0, January 2004
+                         http://www.apache.org/licenses/
+
+    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+    1. Definitions.
+
+       "License" shall mean the terms and conditions for use, reproduction,
+       and distribution as defined by Sections 1 through 9 of this document.
+
+       "Licensor" shall mean the copyright owner or entity authorized by
+       the copyright owner that is granting the License.
+
+       "Legal Entity" shall mean the union of the acting entity and all
+       other entities that control, are controlled by, or are under common
+       control with that entity. For the purposes of this definition,
+       "control" means (i) the power, direct or indirect, to cause the
+       direction or management of such entity, whether by contract or
+       otherwise, or (ii) ownership of fifty percent (50%) or more of the
+       outstanding shares, or (iii) beneficial ownership of such entity.
+
+       "You" (or "Your") shall mean an individual or Legal Entity
+       exercising permissions granted by this License.
+
+       "Source" form shall mean the preferred form for making modifications,
+       including but not limited to software source code, documentation
+       source, and configuration files.
+
+       "Object" form shall mean any form resulting from mechanical
+       transformation or translation of a Source form, including but
+       not limited to compiled object code, generated documentation,
+       and conversions to other media types.
+
+       "Work" shall mean the work of authorship, whether in Source or
+       Object form, made available under the License, as indicated by a
+       copyright notice that is included in or attached to the work
+       (an example is provided in the Appendix below).
+
+       "Derivative Works" shall mean any work, whether in Source or Object
+       form, that is based on (or derived from) the Work and for which the
+       editorial revisions, annotations, elaborations, or other modifications
+       represent, as a whole, an original work of authorship. For the purposes
+       of this License, Derivative Works shall not include works that remain
+       separable from, or merely link (or bind by name) to the interfaces of,
+       the Work and Derivative Works thereof.
+
+       "Contribution" shall mean any work of authorship, including
+       the original version of the Work and any modifications or additions
+       to that Work or Derivative Works thereof, that is intentionally
+       submitted to Licensor for inclusion in the Work by the copyright owner
+       or by an individual or Legal Entity authorized to submit on behalf of
+       the copyright owner. For the purposes of this definition, "submitted"
+       means any form of electronic, verbal, or written communication sent
+       to the Licensor or its representatives, including but not limited to
+       communication on electronic mailing lists, source code control systems,
+       and issue tracking systems that are managed by, or on behalf of, the
+       Licensor for the purpose of discussing and improving the Work, but
+       excluding communication that is conspicuously marked or otherwise
+       designated in writing by the copyright owner as "Not a Contribution."
+
+       "Contributor" shall mean Licensor and any individual or Legal Entity
+       on behalf of whom a Contribution has been received by Licensor and
+       subsequently incorporated within the Work.
+
+    2. Grant of Copyright License. Subject to the terms and conditions of
+       this License, each Contributor hereby grants to You a perpetual,
+       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+       copyright license to reproduce, prepare Derivative Works of,
+       publicly display, publicly perform, sublicense, and distribute the
+       Work and such Derivative Works in Source or Object form.
+
+    3. Grant of Patent License. Subject to the terms and conditions of
+       this License, each Contributor hereby grants to You a perpetual,
+       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+       (except as stated in this section) patent license to make, have made,
+       use, offer to sell, sell, import, and otherwise transfer the Work,
+       where such license applies only to those patent claims licensable
+       by such Contributor that are necessarily infringed by their
+       Contribution(s) alone or by combination of their Contribution(s)
+       with the Work to which such Contribution(s) was submitted. If You
+       institute patent litigation against any entity (including a
+       cross-claim or counterclaim in a lawsuit) alleging that the Work
+       or a Contribution incorporated within the Work constitutes direct
+       or contributory patent infringement, then any patent licenses
+       granted to You under this License for that Work shall terminate
+       as of the date such litigation is filed.
+
+    4. Redistribution. You may reproduce and distribute copies of the
+       Work or Derivative Works thereof in any medium, with or without
+       modifications, and in Source or Object form, provided that You
+       meet the following conditions:
+
+       (a) You must give any other recipients of the Work or
+           Derivative Works a copy of this License; and
+
+       (b) You must cause any modified files to carry prominent notices
+           stating that You changed the files; and
+
+       (c) You must retain, in the Source form of any Derivative Works
+           that You distribute, all copyright, patent, trademark, and
+           attribution notices from the Source form of the Work,
+           excluding those notices that do not pertain to any part of
+           the Derivative Works; and
+
+       (d) If the Work includes a "NOTICE" text file as part of its
+           distribution, then any Derivative Works that You distribute must
+           include a readable copy of the attribution notices contained
+           within such NOTICE file, excluding those notices that do not
+           pertain to any part of the Derivative Works, in at least one
+           of the following places: within a NOTICE text file distributed
+           as part of the Derivative Works; within the Source form or
+           documentation, if provided along with the Derivative Works; or,
+           within a display generated by the Derivative Works, if and
+           wherever such third-party notices normally appear. The contents
+           of the NOTICE file are for informational purposes only and
+           do not modify the License. You may add Your own attribution
+           notices within Derivative Works that You distribute, alongside
+           or as an addendum to the NOTICE text from the Work, provided
+           that such additional attribution notices cannot be construed
+           as modifying the License.
+
+       You may add Your own copyright statement to Your modifications and
+       may provide additional or different license terms and conditions
+       for use, reproduction, or distribution of Your modifications, or
+       for any such Derivative Works as a whole, provided Your use,
+       reproduction, and distribution of the Work otherwise complies with
+       the conditions stated in this License.
+
+    5. Submission of Contributions. Unless You explicitly state otherwise,
+       any Contribution intentionally submitted for inclusion in the Work
+       by You to the Licensor shall be under the terms and conditions of
+       this License, without any additional terms or conditions.
+       Notwithstanding the above, nothing herein shall supersede or modify
+       the terms of any separate license agreement you may have executed
+       with Licensor regarding such Contributions.
+
+    6. Trademarks. This License does not grant permission to use the trade
+       names, trademarks, service marks, or product names of the Licensor,
+       except as required for reasonable and customary use in describing the
+       origin of the Work and reproducing the content of the NOTICE file.
+
+    7. Disclaimer of Warranty. Unless required by applicable law or
+       agreed to in writing, Licensor provides the Work (and each
+       Contributor provides its Contributions) on an "AS IS" BASIS,
+       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+       implied, including, without limitation, any warranties or conditions
+       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+       PARTICULAR PURPOSE. You are solely responsible for determining the
+       appropriateness of using or redistributing the Work and assume any
+       risks associated with Your exercise of permissions under this License.
+
+    8. Limitation of Liability. In no event and under no legal theory,
+       whether in tort (including negligence), contract, or otherwise,
+       unless required by applicable law (such as deliberate and grossly
+       negligent acts) or agreed to in writing, shall any Contributor be
+       liable to You for damages, including any direct, indirect, special,
+       incidental, or consequential damages of any character arising as a
+       result of this License or out of the use or inability to use the
+       Work (including but not limited to damages for loss of goodwill,
+       work stoppage, computer failure or malfunction, or any and all
+       other commercial damages or losses), even if such Contributor
+       has been advised of the possibility of such damages.
+
+    9. Accepting Warranty or Additional Liability. While redistributing
+       the Work or Derivative Works thereof, You may choose to offer,
+       and charge a fee for, acceptance of support, warranty, indemnity,
+       or other liability obligations and/or rights consistent with this
+       License. However, in accepting such obligations, You may act only
+       on Your own behalf and on Your sole responsibility, not on behalf
+       of any other Contributor, and only if You agree to indemnify,
+       defend, and hold each Contributor harmless for any liability
+       incurred by, or claims asserted against, such Contributor by reason
+       of your accepting any such warranty or additional liability.
+
+    END OF TERMS AND CONDITIONS
+
+    APPENDIX: How to apply the Apache License to your work.
+
+       To apply the Apache License to your work, attach the following
+       boilerplate notice, with the fields enclosed by brackets "[]"
+       replaced with your own identifying information. (Don't include
+       the brackets!) The text should be enclosed in the appropriate
+       comment syntax for the file format. We also recommend that a
+       file or class name and description of purpose be included on the
+       same "printed page" as the copyright notice for easier
+       identification within third-party archives.
+
+    Copyright [yyyy] [name of copyright owner]
+
+    Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
README.md ADDED
@@ -0,0 +1,13 @@
+ ---
+ title: Topic modelling
+ emoji: 🚀
+ colorFrom: red
+ colorTo: yellow
+ sdk: gradio
+ sdk_version: 3.50.0
+ app_file: app.py
+ pinned: false
+ license: apache-2.0
+ ---
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,250 @@
+ import os
+
+ #os.environ["TOKENIZERS_PARALLELISM"] = "true"
+ #os.environ["HF_HOME"] = "/mnt/c/..."
+ #os.environ["CUDA_PATH"] = "/mnt/c/..."
+
+ print(os.environ.get("HF_HOME")) # .get avoids a KeyError when HF_HOME is not set
+
+ import gradio as gr
+ from datetime import datetime
+ import pandas as pd
+ import numpy as np
+ from sklearn.cluster import KMeans
+ from sklearn.feature_extraction.text import CountVectorizer
+ from transformers import AutoModel
+ import funcs.anonymiser as anon
+
+ from torch import cuda, backends, version
+
+ # Check for torch cuda
+ print("Is CUDA enabled? ", cuda.is_available())
+ print("Is a CUDA device available on this computer?", backends.cudnn.enabled)
+ if cuda.is_available():
+     torch_device = "gpu"
+     print("Cuda version installed is: ", version.cuda)
+     low_resource_mode = "No"
+     #os.system("nvidia-smi")
+ else:
+     torch_device = "cpu"
+     low_resource_mode = "Yes"
+
+ print("Device used is: ", torch_device)
+
+ #os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
+
+ from bertopic import BERTopic
+ #from sentence_transformers import SentenceTransformer
+ #from bertopic.backend._hftransformers import HFTransformerBackend
+
+ #from cuml.manifold import UMAP
+
+ #umap_model = UMAP(n_components=5, n_neighbors=15, min_dist=0.0)
+
+ today = datetime.now().strftime("%d%m%Y")
+ today_rev = datetime.now().strftime("%Y%m%d")
+
+ from funcs.helper_functions import dummy_function, put_columns_in_df, read_file, get_file_path_end
+ from funcs.representation_model import representation_model
+ from funcs.embeddings import make_or_load_embeddings
+
+ # Load embeddings
+ #embedding_model_name = "BAAI/bge-small-en-v1.5"
+ #embedding_model = SentenceTransformer(embedding_model_name)
+
+ # Pinning a Jina revision for security purposes: https://www.baseten.co/blog/pinning-ml-model-revisions-for-compatibility-and-security/
+ # Save Jina model locally as described here: https://huggingface.co/jinaai/jina-embeddings-v2-base-en/discussions/29
+ embeddings_name = "jinaai/jina-embeddings-v2-small-en"
+ local_embeddings_location = "model/jina/"
+ revision_choice = "b811f03af3d4d7ea72a7c25c802b21fc675a5d99"
+
+ try:
+     embedding_model = AutoModel.from_pretrained(local_embeddings_location, revision=revision_choice, trust_remote_code=True, local_files_only=True, device_map="auto")
+ except Exception: # a bare except would also swallow KeyboardInterrupt and SystemExit
+     embedding_model = AutoModel.from_pretrained(embeddings_name, revision=revision_choice, trust_remote_code=True, device_map="auto")
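+
+ # Hedged one-off sketch (kept commented out, like the other alternatives above): saving the
+ # pinned model locally with transformers' standard save_pretrained would make the
+ # local_files_only=True branch above succeed on later runs.
+ #embedding_model.save_pretrained(local_embeddings_location)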
+
+
+ def extract_topics(in_files, in_file, min_docs_slider, in_colnames, max_topics_slider, candidate_topics, in_label, anonymise_drop, return_intermediate_files, embeddings_super_compress, low_resource_mode_opt):
+
+     file_list = [string.name for string in in_file]
+
+     data_file_names = [string.lower() for string in file_list if "tokenised" not in string and "npz" not in string.lower() and "gz" not in string.lower()]
+     data_file_name = data_file_names[0]
+     data_file_name_no_ext = get_file_path_end(data_file_name)
+
+     in_colnames_list_first = in_colnames[0]
+
+     if in_label:
+         in_label_list_first = in_label[0]
+     else:
+         in_label_list_first = in_colnames_list_first
+
+     if anonymise_drop == "Yes":
+         in_files_anon_col, anonymisation_success = anon.anonymise_script(in_files, in_colnames_list_first, anon_strat="replace")
+         in_files[in_colnames_list_first] = in_files_anon_col[in_colnames_list_first]
+         in_files.to_csv("anonymised_data.csv")
+
+     docs = list(in_files[in_colnames_list_first].str.lower())
+     label_col = in_files[in_label_list_first]
+
+     # Check if embeddings are being loaded in
+     ## Load in pre-embedded file if it exists
+     file_list = [string.name for string in in_file]
+
+     embeddings_out, reduced_embeddings = make_or_load_embeddings(docs, file_list, data_file_name_no_ext, embedding_model, return_intermediate_files, embeddings_super_compress, low_resource_mode_opt)
+
+     # all_lengths = [len(embedding) for embedding in embeddings_out]
+     # if len(set(all_lengths)) > 1:
+     #     print("Inconsistent lengths found in embeddings_out:", set(all_lengths))
+     # else:
+     #     print("All lengths are the same.")
+
+     # print("Embeddings type: ", type(embeddings_out))
+
+     # if isinstance(embeddings_out, np.ndarray):
+     #     print("my_object is a NumPy ndarray")
+     # else:
+     #     print("my_object is not a NumPy ndarray")
+
+     # Clustering set to K-means (not used)
+     #cluster_model = KMeans(n_clusters=max_topics_slider)
+
+     # Countvectoriser removes stopwords and combines terms up to two words long:
+     if min_docs_slider < 3:
+         min_df_val = min_docs_slider
+     else:
+         min_df_val = 3
+
+     print(min_df_val)
+
+     vectoriser_model = CountVectorizer(stop_words="english", ngram_range=(1, 2), min_df=min_df_val) # previously hard-coded as min_df=0.1, which silently ignored min_df_val computed above
+
+     if not candidate_topics:
+         topic_model = BERTopic(embedding_model=embedding_model,
+                                #hdbscan_model=cluster_model,
+                                vectorizer_model=vectoriser_model,
+                                min_topic_size=min_docs_slider,
+                                nr_topics=max_topics_slider,
+                                representation_model=representation_model,
+                                verbose=True)
+
+         topics_text, probs = topic_model.fit_transform(docs, embeddings_out)
+
+     # Do this if you have pre-assigned topics
+     else:
+         zero_shot_topics_list = read_file(candidate_topics.name)
+         zero_shot_topics_list_lower = [x.lower() for x in zero_shot_topics_list]
+
+         print(zero_shot_topics_list_lower)
+
+         topic_model = BERTopic(embedding_model=embedding_model,
+                                #hdbscan_model=cluster_model,
+                                vectorizer_model=vectoriser_model,
+                                min_topic_size=min_docs_slider,
+                                nr_topics=max_topics_slider,
+                                zeroshot_topic_list=zero_shot_topics_list_lower,
+                                zeroshot_min_similarity=0.7,
+                                representation_model=representation_model,
+                                verbose=True)
+
+         topics_text, probs = topic_model.fit_transform(docs, embeddings_out)
+
+     if not topics_text:
+         return "No topics found, original file returned", data_file_name, None # None fills the plot output so Gradio receives all three expected values
+
+     else:
+         topics_text_out = topics_text
+         topics_scores_out = probs
+
+         topic_det_output_name = "topic_details_" + today_rev + ".csv"
+
+         topic_dets = topic_model.get_topic_info()
+
+         topic_dets.to_csv(topic_det_output_name)
+         #print(topic_dets)
+
+         doc_det_output_name = "doc_details_" + today_rev + ".csv"
+         doc_dets = topic_model.get_document_info(docs)[["Document", "Topic", "Probability", "Name", "Representative_document"]]
+         doc_dets.to_csv(doc_det_output_name)
+         #print(doc_dets)
+
+         #topics_text_out_str = ', '.join(list(topic_dets["KeyBERT"]))
+
+         topics_text_out_str = str(topic_dets["KeyBERT"])
+         #topics_scores_out_str = str(doc_dets["Probability"][0])
+
+         output_text = "Topics: " + topics_text_out_str #+ "\n\nProbability scores: " + topics_scores_out_str
+
+         # Outputs
+         embedding_file_name = data_file_name_no_ext + '_' + 'embeddings.npz'
+         np.savez_compressed(embedding_file_name, embeddings_out)
+
+         topic_model_save_name = data_file_name_no_ext + "_topics_" + today_rev + ".pkl"
+         topic_model.save(topic_model_save_name, serialization='pickle', save_embedding_model=False, save_ctfidf=False)
+
+         # Visualise the topics:
+         topics_vis = topic_model.visualize_documents(label_col, reduced_embeddings=reduced_embeddings, hide_annotations=True, hide_document_hover=False, custom_labels=True)
+
+         return output_text, [doc_det_output_name, topic_det_output_name, embedding_file_name, topic_model_save_name], topics_vis
+
+
+ # ## Gradio app - extract topics
+
+ block = gr.Blocks(theme=gr.themes.Base())
+
+ with block:
+
+     data_state = gr.State(pd.DataFrame())
+
+     gr.Markdown(
+     """
+     # Extract topics from text
+     Enter open text below to get topics. You can copy and paste text directly, or upload a file and specify the column from which you want to extract topics.
+     """)
+
+     #with gr.Accordion("I will copy and paste my open text", open = False):
+     #    in_text = gr.Textbox(label="Copy and paste your open text here", lines = 5)
+
+     with gr.Tab("Load files and find topics"):
+         with gr.Accordion("Load data file", open = True):
+             in_files = gr.File(label="Input text from file", file_count="multiple")
+             with gr.Row():
+                 in_colnames = gr.Dropdown(choices=["Choose a column"], multiselect = True, label="Select column to find topics in (the first will be used if multiple are selected).")
+                 in_label = gr.Dropdown(choices=["Choose a column"], multiselect = True, label="Select column for labelling documents in the output visualisation.")
+
+         with gr.Accordion("I have my own list of topics. File should have at least one column with a header and topic keywords in cells below. Topics will be taken from the first column of the file", open = False):
+             candidate_topics = gr.File(label="Input topics from file (csv)")
+
+         with gr.Row():
+             min_docs_slider = gr.Slider(minimum = 1, maximum = 1000, value = 15, step = 1, label = "Minimum number of documents needed to create topic")
+             max_topics_slider = gr.Slider(minimum = 2, maximum = 500, value = 3, step = 1, label = "Maximum number of topics")
+
+         with gr.Row():
+             topics_btn = gr.Button("Extract topics")
+
+         with gr.Row():
+             output_single_text = gr.Textbox(label="Output example (first example in dataset)")
+             output_file = gr.File(label="Output file")
+
+         plot = gr.Plot(label="Visualise your topics here:")
+
+     with gr.Tab("Load and data processing options"):
+         with gr.Accordion("Process data on load", open = True):
+             anonymise_drop = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Anonymise data on file load.")
+             return_intermediate_files = gr.Dropdown(label = "Return intermediate processing files from file preparation. Files can be loaded in to save processing time in future.", value="No", choices=["Yes", "No"])
+             embedding_super_compress = gr.Dropdown(label = "Round embeddings to three dp for smaller files with less accuracy.", value="No", choices=["Yes", "No"])
+             low_resource_mode_opt = gr.Dropdown(label = "Low resource mode (non-AI embeddings, no LLM-generated topic names).", value=low_resource_mode, choices=["Yes", "No"])
+
+     # Update column names dropdown when file uploaded
+     in_files.upload(fn=put_columns_in_df, inputs=[in_files], outputs=[in_colnames, in_label, data_state])
+     in_colnames.change(dummy_function, in_colnames, None)
+
+     topics_btn.click(fn=extract_topics, inputs=[data_state, in_files, min_docs_slider, in_colnames, max_topics_slider, candidate_topics, in_label, anonymise_drop, return_intermediate_files, embedding_super_compress, low_resource_mode_opt], outputs=[output_single_text, output_file, plot], api_name="topics")
+
+ block.queue().launch(debug=True)#, server_name="0.0.0.0", ssl_verify=False, server_port=7860)
funcs/__init__.py ADDED
File without changes
funcs/anonymiser.py ADDED
@@ -0,0 +1,251 @@
+ import spacy
+ import os
+
+ def is_model_installed(model_name):
+     try:
+         # Try to load the model
+         spacy.load(model_name)
+         return True
+     except OSError:
+         return False
+
+ model_name = "en_core_web_sm"
+ if not is_model_installed(model_name):
+     os.system(f"python -m spacy download {model_name}")
+
+
+ # Need to overwrite version of gradio present in Huggingface spaces as it doesn't have like buttons/avatars (Oct 2023)
+ #os.system("pip uninstall -y gradio")
+ #os.system("pip install gradio==3.50.0")
+ #os.system("python -m spacy download en_core_web_lg")
+
+ spacy.load(model_name)
+
+ import re
+ import secrets
+ import base64
+ import time
+
+ import pandas as pd
+ import gradio as gr
+
+ from faker import Faker
+
+ from presidio_analyzer import AnalyzerEngine, BatchAnalyzerEngine
+ from presidio_anonymizer import AnonymizerEngine, BatchAnonymizerEngine
+ from presidio_anonymizer.entities import OperatorConfig
+
+ # Create faker function (note that it has to receive a value). Defined at module level
+ # so that anon_consistent_names can use it too; the original defined it inside
+ # anonymise_script, after anon_consistent_names had already referenced it.
+ fake = Faker("en_UK")
+
+ def fake_first_name(x):
+     return fake.first_name()
+
+
+ def anon_consistent_names(df):
+     # ## Pick out common names and replace them with the same person value
+     df_dict = df.to_dict(orient="list")
+
+     analyzer = AnalyzerEngine()
+     batch_analyzer = BatchAnalyzerEngine(analyzer_engine=analyzer)
+
+     analyzer_results = batch_analyzer.analyze_dict(df_dict, language="en")
+     analyzer_results = list(analyzer_results)
+
+     text = analyzer_results[3].value
+
+     recognizer_result = str(analyzer_results[3].recognizer_results)
+
+     data_str = recognizer_result # abbreviated for brevity
+
+     # Adjusting the parse_dict function to handle trailing ']'
+     # Splitting the main data string into individual list strings
+     list_strs = data_str[1:-1].split('], [')
+
+     def parse_dict(s):
+         s = s.strip('[]') # Removing any surrounding brackets
+         items = s.split(', ')
+         d = {}
+         for item in items:
+             key, value = item.split(': ')
+             if key == 'score':
+                 d[key] = float(value)
+             elif key in ['start', 'end']:
+                 d[key] = int(value)
+             else:
+                 d[key] = value
+         return d
+
+     # Re-running the improved processing code
+
+     result = []
+
+     for lst_str in list_strs:
+         # Splitting each list string into individual dictionary strings
+         dict_strs = lst_str.split(', type: ')
+         dict_strs = [dict_strs[0]] + ['type: ' + s for s in dict_strs[1:]] # Prepending "type: " back to the split strings
+
+         # Parsing each dictionary string
+         dicts = [parse_dict(d) for d in dict_strs]
+         result.append(dicts)
+
+     #result
+
+     names = []
+
+     for idx, paragraph in enumerate(text):
+         paragraph_texts = []
+         for dictionary in result[idx]:
+             if dictionary['type'] == 'PERSON':
+                 paragraph_texts.append(paragraph[dictionary['start']:dictionary['end']])
+         names.append(paragraph_texts)
+
+     # Flatten the list of lists and extract unique names
+     unique_names = list(set(name for sublist in names for name in sublist))
+
+     fake_names = pd.Series(unique_names).apply(fake_first_name)
+
+     mapping_df = pd.DataFrame(data={"Unique names":unique_names,
+                                     "Fake names": fake_names})
+
+     # Convert mapping dataframe to dictionary, adding word boundaries for full-word match
+     name_map = {r'\b' + k + r'\b': v for k, v in zip(mapping_df['Unique names'], mapping_df['Fake names'])}
+
+     scrubbed_df_consistent_names = df.replace(name_map, regex = True)
+
+     return scrubbed_df_consistent_names
+
+ def detect_file_type(filename):
+     """Detect the file type based on its extension."""
+     if (filename.endswith('.csv')) | (filename.endswith('.csv.gz')) | (filename.endswith('.zip')):
+         return 'csv'
+     elif filename.endswith('.xlsx'):
+         return 'xlsx'
+     elif filename.endswith('.parquet'):
+         return 'parquet'
+     else:
+         raise ValueError("Unsupported file type.")
+
+ def read_file(filename):
+     """Read the file based on its detected type."""
+     file_type = detect_file_type(filename)
+
+     if file_type == 'csv':
+         return pd.read_csv(filename, low_memory=False)
+     elif file_type == 'xlsx':
+         return pd.read_excel(filename)
+     elif file_type == 'parquet':
+         return pd.read_parquet(filename)
+
+ def anonymise_script(df, chosen_col, anon_strat):
+
+     # DataFrame to dict
+     df_dict = pd.DataFrame(data={chosen_col:df[chosen_col].astype(str)}).to_dict(orient="list")
+
+     analyzer = AnalyzerEngine()
+     batch_analyzer = BatchAnalyzerEngine(analyzer_engine=analyzer)
+
+     anonymizer = AnonymizerEngine()
+
+     batch_anonymizer = BatchAnonymizerEngine(anonymizer_engine = anonymizer)
+
+     print("Identifying personal data")
+     analyse_tic = time.perf_counter()
+     analyzer_results = batch_analyzer.analyze_dict(df_dict, language="en")
+     #print(analyzer_results)
+     analyzer_results = list(analyzer_results)
+
+     analyse_toc = time.perf_counter()
+     analyse_time_out = f"Analysing the text took {analyse_toc - analyse_tic:0.1f} seconds."
+     print(analyse_time_out)
+
+     # Generate a 128-bit AES key, then encode the key using base64 to get a string representation
+     key = secrets.token_bytes(16) # 128 bits = 16 bytes
+     key_string = base64.b64encode(key).decode('utf-8')
+
+     # Set up the anonymization configuration WITHOUT DATE_TIME
+     # (plain dict literals; the original wrapped these in unnecessary eval() calls)
+     replace_config = {"DEFAULT": OperatorConfig("replace")}
+     redact_config = {"DEFAULT": OperatorConfig("redact")}
+     hash_config = {"DEFAULT": OperatorConfig("hash")}
+     mask_config = {"DEFAULT": OperatorConfig("mask", {"masking_char":"*", "chars_to_mask":100, "from_end":True})}
+     people_encrypt_config = {"PERSON": OperatorConfig("encrypt", {"key": key_string})} # The encryption uses an AES cypher in CBC mode and requires a cryptographic key as an input for both the encryption and the decryption.
+     fake_first_name_config = {"PERSON": OperatorConfig("custom", {"lambda": fake_first_name})}
+
+     if anon_strat == "replace": chosen_mask_config = replace_config
+     elif anon_strat == "redact": chosen_mask_config = redact_config
+     elif anon_strat == "hash": chosen_mask_config = hash_config
+     elif anon_strat == "mask": chosen_mask_config = mask_config
+     elif anon_strat == "encrypt": chosen_mask_config = people_encrypt_config
+     elif anon_strat == "fake_first_name": chosen_mask_config = fake_first_name_config
+     else: chosen_mask_config = replace_config # fall back to replace rather than failing on an unknown strategy
+
+     # I think in general people will want to keep date / times
+     keep_date_config = {"DATE_TIME": OperatorConfig("keep")}
+
+     combined_config = {**chosen_mask_config, **keep_date_config}
+
+     anonymizer_results = batch_anonymizer.anonymize_dict(analyzer_results, operators=combined_config)
+
+     scrubbed_df = pd.DataFrame(anonymizer_results)
+
+     # Create reporting message
+     out_message = "Successfully anonymised"
+
+     if anon_strat == "encrypt":
+         out_message = out_message + ". Your decryption key is " + key_string + "."
+
+     return scrubbed_df, out_message
+
221
+ def do_anonymise(in_file, anon_strat, chosen_cols):
222
+
223
+ # Load file
224
+
225
+ anon_df = pd.DataFrame()
226
+
227
+ if in_file:
228
+ for match_file in in_file:
229
+ match_temp_file = pd.read_csv(match_file.name, delimiter = ",", low_memory=False)#, encoding='cp1252')
230
+ anon_df = pd.concat([anon_df, match_temp_file])
231
+
232
+ # Split dataframe to keep only selected columns
233
+ all_cols_original_order = list(anon_df.columns)
234
+ anon_df_part = anon_df[chosen_cols]
235
+ anon_df_remain = anon_df.drop(chosen_cols, axis = 1)
236
+
237
+ # Anonymise the selected columns
238
+ anon_df_part_out, out_message = anonymise_script(anon_df_part, anon_strat)
239
+
240
+ # Rejoin the dataframe together
241
+ anon_df_out = pd.concat([anon_df_part_out, anon_df_remain], axis = 1)
242
+ anon_df_out = anon_df_out[all_cols_original_order]
243
+
244
+ # Export file
245
+ out_file_part = re.sub(r'\.csv', '', match_file.name)
246
+
247
+ anon_export_file_name = out_file_part + "_anon_" + anon_strat + ".csv"
248
+
249
+ anon_df_out.to_csv(anon_export_file_name, index = None)
250
+
251
+ return out_message, anon_export_file_name
funcs/embeddings.py ADDED
@@ -0,0 +1,78 @@
+ import time
+ import numpy as np
+ from torch import cuda
+ from sklearn.pipeline import make_pipeline
+ from sklearn.decomposition import TruncatedSVD
+ from sklearn.feature_extraction.text import TfidfVectorizer
+ from umap import UMAP
+
+ if cuda.is_available():
+     torch_device = "gpu"
+ else:
+     torch_device = "cpu"
+
+ def make_or_load_embeddings(docs, file_list, data_file_name_no_ext, embedding_model, return_intermediate_files, embeddings_super_compress, low_resource_mode_opt):
+
+     embeddings_file_names = [string.lower() for string in file_list if "embedding" in string.lower()]
+
+     if embeddings_file_names:
+         print("Loading embeddings from file.")
+         embeddings_out = np.load(embeddings_file_names[0])['arr_0']
+
+         # If embedding files have 'super_compress' in the title, they have been multiplied by 100 before save
+         if "compress" in embeddings_file_names[0]:
+             embeddings_out /= 100
+
+         # print("embeddings loaded: ", embeddings_out)
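+         # Worked round trip of the compression scheme: 0.123456 is rounded to 0.123 and
+         # saved as 12.3 (x100, see the save branch below); dividing by 100 here restores 0.123.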
+
+     if not embeddings_file_names:
+         tic = time.perf_counter()
+         print("Starting to embed documents.")
+
+         # Custom model
+         # If on CPU, don't resort to embedding models
+         if low_resource_mode_opt == "Yes":
+             print("Creating simplified 'sparse' embeddings based on TfIDF")
+             embedding_model = make_pipeline(
+                 TfidfVectorizer(),
+                 TruncatedSVD(100)
+             )
+
+             # sklearn pipelines have no .encode method (the original call would fail), so
+             # fit_transform builds the reduced TF-IDF matrix instead
+             embeddings_out = embedding_model.fit_transform(docs)
+
+         elif low_resource_mode_opt == "No":
+             print("Creating dense embeddings based on transformers model")
+
+             embeddings_out = embedding_model.encode(sentences=docs, max_length=1024, show_progress_bar = True, batch_size = 32) # For Jina
+
+             #import torch
+             #from torch.nn.utils.rnn import pad_sequence
+
+             # Assuming embeddings_out is a list of tensors
+             #embeddings_out = [torch.tensor(embedding) for embedding in embeddings_out]
+
+             # Pad the sequences
+             # Set batch_first=True if you want the batch dimension to be the first dimension
+             #embeddings_out = pad_sequence(embeddings_out, batch_first=True, padding_value=0)
+
+         toc = time.perf_counter()
+         time_out = f"The embedding took {toc - tic:0.1f} seconds"
+         print(time_out)
+
+         # If you want to save your files for next time
+         if return_intermediate_files == "Yes":
+             if embeddings_super_compress == "No":
+                 semantic_search_file_name = data_file_name_no_ext + '_' + 'embeddings.npz'
+                 np.savez_compressed(semantic_search_file_name, embeddings_out)
+             else:
+                 semantic_search_file_name = data_file_name_no_ext + '_' + 'embedding_compress.npz'
+                 embeddings_out_round = np.round(embeddings_out, 3)
+                 embeddings_out_round *= 100 # Rounding not currently used
+                 np.savez_compressed(semantic_search_file_name, embeddings_out_round)
+
+     # Pre-reduce embeddings for visualisation purposes
+     reduced_embeddings = UMAP(n_neighbors=15, n_components=2, min_dist=0.0, metric='cosine', random_state=42).fit_transform(embeddings_out)
+
+     return embeddings_out, reduced_embeddings
funcs/helper_functions.py ADDED
@@ -0,0 +1,89 @@
+ import os
+ import re
+ import pandas as pd
+ import gradio as gr
+ import gzip
+ import pickle
+
+
+ def detect_file_type(filename):
+     """Detect the file type based on its extension."""
+     if (filename.endswith('.csv')) | (filename.endswith('.csv.gz')) | (filename.endswith('.zip')):
+         return 'csv'
+     elif filename.endswith('.xlsx'):
+         return 'xlsx'
+     elif filename.endswith('.parquet'):
+         return 'parquet'
+     elif filename.endswith('.pkl.gz'):
+         return 'pkl.gz'
+     else:
+         raise ValueError("Unsupported file type.")
+
+ def read_file(filename):
+     """Read the file based on its detected type."""
+     file_type = detect_file_type(filename)
+
+     print("Loading in file")
+
+     if file_type == 'csv':
+         file = pd.read_csv(filename, low_memory=False).reset_index().drop(["index", "Unnamed: 0"], axis=1, errors="ignore")
+     elif file_type == 'xlsx':
+         file = pd.read_excel(filename).reset_index().drop(["index", "Unnamed: 0"], axis=1, errors="ignore")
+     elif file_type == 'parquet':
+         file = pd.read_parquet(filename).reset_index().drop(["index", "Unnamed: 0"], axis=1, errors="ignore")
+     elif file_type == 'pkl.gz':
+         with gzip.open(filename, 'rb') as file:
+             file = pickle.load(file)
+         #file = pd.read_pickle(filename)
+
+     print("File load complete")
+
+     return file
+
+ def put_columns_in_df(in_file, in_bm25_column=None):
+     '''
+     When a file is loaded, update the column dropdown choices and change the 'clean data' dropdown option to 'no'.
+     in_bm25_column is unused and defaults to None, since the app only passes the file input.
+     '''
+
+     file_list = [string.name for string in in_file]
+
+     data_file_names = [string.lower() for string in file_list if "npz" not in string.lower()]
+     data_file_name = data_file_names[0]
+
+     new_choices = []
+     concat_choices = []
+
+     df = read_file(data_file_name)
+
+     new_choices = list(df.columns)
+
+     concat_choices.extend(new_choices)
+
+     return gr.Dropdown(choices=concat_choices), gr.Dropdown(choices=concat_choices), df
+
+ def get_file_path_end(file_path):
+     # First, get the basename of the file (e.g., "example.txt" from "/path/to/example.txt")
+     basename = os.path.basename(file_path)
+
+     # Then, split the basename and its extension and return only the basename without the extension
+     filename_without_extension, _ = os.path.splitext(basename)
+
+     #print(filename_without_extension)
+
+     return filename_without_extension
+
+ def get_file_path_end_with_ext(file_path):
+     match = re.search(r'(.*[\/\\])?(.+)$', file_path)
+
+     filename_end = match.group(2) if match else ''
+
+     return filename_end
+
+ def dummy_function(in_colnames):
+     """
+     A dummy function that exists just so that dropdown updates work correctly.
+     """
+     return None
funcs/prompts.py ADDED
@@ -0,0 +1,106 @@
+ # A generic Hermes 2 chat prompt (superseded below by the topic-labelling open_hermes_prompt)
+ open_hermes_prompt = """<|im_start|>system
+ You are "Hermes 2", a conscious sentient superintelligent artificial intelligence developed by a man named Teknium, and your purpose and drive is to assist the user with any request they have.<|im_end|>
+ <|im_start|>user
+ Hello, who are you?<|im_end|>
+ <|im_start|>assistant
+ """
+
+
+ # Example prompt demonstrating the output we are looking for
+ capybara_start = "USER:"
+
+ capybara_example_prompt = """USER:I have a topic that contains the following documents:
+ - Traditional diets in most cultures were primarily plant-based with a little meat on top, but with the rise of industrial style meat production and factory farming, meat has become a staple food.
+ - Meat, but especially beef, is the worst food in terms of emissions.
+ - Eating meat doesn't make you a bad person, not eating meat doesn't make you a good one.
+
+ The topic is described by the following keywords: 'meat, beef, eat, eating, emissions, steak, food, health, processed, chicken'.
+
+ Based on the information about the topic above, please create a short label of this topic. Make sure to only return the label and nothing more.
+
+ Topic label: Environmental impacts of eating meat
+ """
+
+
+ # Our main prompt with documents ([DOCUMENTS]) and keywords ([KEYWORDS]) tags
+ capybara_main_prompt = """
+ Now, create a new topic label given the following information.
+
+ I have a topic that contains the following documents:
+ [DOCUMENTS]
+
+ The topic is described by the following keywords: '[KEYWORDS]'.
+
+ Based on the information about the topic above, please create a short label of this topic. Make sure to only return the label and nothing more.
+ ASSISTANT:Topic label:"""
+
+ capybara_prompt = capybara_example_prompt + capybara_main_prompt
+
+ print("Capybara prompt: ", capybara_prompt)
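+
+ # BERTopic's representation models substitute the literal [DOCUMENTS] and [KEYWORDS]
+ # tags at runtime; a rough, hypothetical illustration of the finished prompt:
+ #print(capybara_main_prompt.replace("[DOCUMENTS]", "- Example document one.\n- Example document two.").replace("[KEYWORDS]", "example, keywords"))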
+
+ # System prompt describes information given to all conversations
+ open_hermes_start = "<|im_start|>"
+ open_hermes_system_prompt = """<|im_start|>system
+ You are a helpful, respectful and honest assistant for labeling topics.<|im_end|>
+ """
+
+ # Example prompt demonstrating the output we are looking for
+ open_hermes_example_prompt = """<|im_start|>user
+ I have a topic that contains the following documents:
+ - Traditional diets in most cultures were primarily plant-based with a little meat on top, but with the rise of industrial style meat production and factory farming, meat has become a staple food.
+ - Meat, but especially beef, is the worst food in terms of emissions.
+ - Eating meat doesn't make you a bad person, not eating meat doesn't make you a good one.
+
+ The topic is described by the following keywords: 'meat, beef, eat, eating, emissions, steak, food, health, processed, chicken'.
+
+ Based on the information about the topic above, please create a short label of this topic. Make sure to only return the label and nothing more.
+
+ Topic label: Environmental impacts of eating meat
+ """
+ open_hermes_main_prompt = """
+ Now, create a new topic label given the following information.
+
+ I have a topic that contains the following documents:
+ [DOCUMENTS]
+
+ The topic is described by the following keywords: '[KEYWORDS]'.
+
+ Based on the information about the topic above, please create a short label of this topic. Make sure to only return the label and nothing more.<|im_end|>
+ <|im_start|>assistant
+ Topic label:
+ """
+ open_hermes_prompt = open_hermes_system_prompt + open_hermes_example_prompt + open_hermes_main_prompt
+
+ print("Open Hermes prompt: ", open_hermes_prompt)
+
+ stablelm_start = "<|user|>"
+ stablelm_example_prompt = """<|user|>
+ I have a topic that contains the following documents:
+ - Traditional diets in most cultures were primarily plant-based with a little meat on top, but with the rise of industrial style meat production and factory farming, meat has become a staple food.
+ - Meat, but especially beef, is the worst food in terms of emissions.
+ - Eating meat doesn't make you a bad person, not eating meat doesn't make you a good one.
+
+ The topic is described by the following keywords: 'meat, beef, eat, eating, emissions, steak, food, health, processed, chicken'.
+
+ Based on the information about the topic above, please create a short label of this topic. Make sure to only return the label and nothing more.
+
+ Topic label: Environmental impacts of eating meat
+ """
+
+ # Our main prompt with documents ([DOCUMENTS]) and keywords ([KEYWORDS]) tags
+ stablelm_main_prompt = """
+ Now, create a new topic label given the following information.
+
+ I have a topic that contains the following documents:
+ [DOCUMENTS]
+
+ The topic is described by the following keywords: '[KEYWORDS]'.
+
+ Based on the information about the topic above, please create a short label of this topic. Make sure to only return the label and nothing more.<|endoftext|>
+ <|assistant|>
+ Topic label:"""
+
+ stablelm_prompt = stablelm_example_prompt + stablelm_main_prompt
+
+ print("StableLM prompt: ", stablelm_prompt)
funcs/representation_model.py ADDED
@@ -0,0 +1,171 @@
+ import os
+ #from ctransformers import AutoModelForCausalLM
+ #from transformers import AutoTokenizer, pipeline
+ from bertopic.representation import LlamaCPP
+ from llama_cpp import Llama
+ from pydantic import BaseModel
+ import torch.cuda
+
+ from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, TextGeneration
+ from funcs.prompts import capybara_prompt, capybara_start, open_hermes_prompt, open_hermes_start, stablelm_prompt, stablelm_start
+
+ #from huggingface_hub import hf_hub_download
+ #hf_hub_download(repo_id='second-state/stablelm-2-zephyr-1.6b-GGUF', filename='stablelm-2-zephyr-1_6b-Q5_K_M.gguf')
+
+ hf_model_name = 'TheBloke/phi-2-orange-GGUF' #'NousResearch/Nous-Capybara-7B-V1.9-GGUF' # 'second-state/stablelm-2-zephyr-1.6b-GGUF'
+ hf_model_file = 'phi-2-orange.Q5_K_M.gguf' #'Capybara-7B-V1.9-Q5_K_M.gguf' # 'stablelm-2-zephyr-1_6b-Q5_K_M.gguf'
+ chosen_prompt = open_hermes_prompt # stablelm_prompt
+ chosen_start_tag = open_hermes_start # stablelm_start
+
+ # Find model file
+ def find_model_file(hf_model_name, hf_model_file):
+     # Fall back to the default Hugging Face cache when HF_HOME is unset (indexing os.environ directly raises a KeyError)
+     hf_sub_loc = os.environ.get("HF_HOME", os.path.expanduser("~/.cache/huggingface")) + "/hub/"
+
+     hf_model_name_path = hf_sub_loc + 'models--' + hf_model_name.replace("/","--")
+
+     print(hf_model_name_path)
+
+     def find_file(root_folder, file_name):
+         for root, dirs, files in os.walk(root_folder):
+             if file_name in files:
+                 return os.path.join(root, file_name)
+         return None
+
+     # Example usage
+     folder_path = hf_model_name_path # Replace with your folder path
+     file_to_find = hf_model_file # Replace with the file name you're looking for
+
+     found_file = find_file(folder_path, file_to_find)
+     if found_file:
+         print(f"File found: {found_file}")
+         return found_file
+     else:
+         error = "File not found."
+         print(error)
+         return error
+
+ found_file = find_model_file(hf_model_name, hf_model_file)
+
+ # n_gpu_layers can be dropped back to 0 even with CUDA available if persistent bugs appear in the GPU implementation
+ if torch.cuda.is_available():
+     torch_device = "gpu"
+     low_resource_mode = "No"
+     n_gpu_layers = 100
+ else:
+     torch_device = "cpu"
+     low_resource_mode = "Yes"
+     n_gpu_layers = 0
+
+ #low_resource_mode = "Yes"
+
+ #print("Running on device:", torch_device)
+ n_threads = torch.get_num_threads()
+ print("CPU n_threads:", n_threads)
+
+ # Default model parameters
+ temperature: float = 0.1
+ top_k: int = 3
+ top_p: float = 1
+ repeat_penalty: float = 1.1
+ last_n_tokens_size: int = 128
+ max_tokens: int = 500
+ seed: int = 42
+ reset: bool = True
+ stream: bool = False
+ n_threads: int = n_threads
+ n_batch: int = 256
+ n_ctx: int = 4096
+ sample: bool = True
+ trust_remote_code: bool = True
+
+ class LLamacppInitConfigGpu(BaseModel):
+     last_n_tokens_size: int
+     seed: int
+     n_threads: int
+     n_batch: int
+     n_ctx: int
+     n_gpu_layers: int
+     temperature: float
+     top_k: int
+     top_p: float
+     repeat_penalty: float
+     max_tokens: int
+     reset: bool
+     stream: bool
+     stop: str
+     trust_remote_code: bool
+
+     def update_gpu(self, new_value: int):
+         self.n_gpu_layers = new_value
+
+ gpu_config = LLamacppInitConfigGpu(last_n_tokens_size=last_n_tokens_size,
+                                    seed=seed,
+                                    n_threads=n_threads,
+                                    n_batch=n_batch,
+                                    n_ctx=n_ctx,
+                                    n_gpu_layers=n_gpu_layers,
+                                    temperature=temperature,
+                                    top_k=top_k,
+                                    top_p=top_p,
+                                    repeat_penalty=repeat_penalty,
+                                    max_tokens=max_tokens,
+                                    reset=reset,
+                                    stream=stream,
+                                    stop=chosen_start_tag,
+                                    trust_remote_code=trust_remote_code)
+
+ cpu_config = gpu_config.model_copy()
+ cpu_config.update_gpu(0)
+
+ class LLamacppGenerateConfig(BaseModel):
+     temperature: float
+     top_k: int
+     top_p: float
+     repeat_penalty: float
+     max_tokens: int
+     reset: bool
+     stream: bool
+
+ gen_config = LLamacppGenerateConfig(
+     temperature=temperature,
+     top_k=top_k,
+     top_p=top_p,
+     repeat_penalty=repeat_penalty,
+     max_tokens=max_tokens,
+     reset=reset,
+     stream=stream)
+
+ ## Create representation model parameters ##
+ # KeyBERT
+ keybert = KeyBERTInspired()
+
+ if low_resource_mode == "No":
+     # Use llama.cpp to load in the model
+     llm = Llama(model_path=found_file, stop=chosen_start_tag, n_gpu_layers=n_gpu_layers, n_ctx=n_ctx) #**gpu_config.model_dump())
+     #print(llm.n_gpu_layers)
+     llm_model = LlamaCPP(llm, prompt=chosen_prompt)#, **gen_config.model_dump())
+
+     # All representation models
+     representation_model = {
+         "KeyBERT": keybert,
+         "Mistral": llm_model
+     }
+
+ elif low_resource_mode == "Yes":
+     representation_model = {"KeyBERT": keybert}
+
+ # Deprecated example using CTransformers. This package is not really used anymore
+ #model = AutoModelForCausalLM.from_pretrained('NousResearch/Nous-Capybara-7B-V1.9-GGUF', model_type='mistral', model_file='Capybara-7B-V1.9-Q5_K_M.gguf', hf=True, **vars(gpu_config))
+ #tokenizer = AutoTokenizer.from_pretrained("NousResearch/Nous-Capybara-7B-V1.9")
+ #generator = pipeline(task="text-generation", model=model, tokenizer=tokenizer)
+
+ # Text generation with Llama 2
+ #mistral_capybara = TextGeneration(generator, prompt=capybara_prompt)
+ #mistral_hermes = TextGeneration(generator, prompt=open_hermes_prompt)
+
+
+ # MMR (is rubbish, don't use)
+ #mmr = MaximalMarginalRelevance(diversity=0.3)
requirements.txt ADDED
@@ -0,0 +1,11 @@
+ gradio==3.50.0
+ transformers
+ accelerate
+ torch
+ llama-cpp-python
+ bertopic
+ spacy
+ pyarrow
+ faker
+ presidio_analyzer
+ presidio_anonymizer