Sonnyjim committed on
Commit 9dbf344 (0 parents)

first commit

.github/workflows/check_file_size.yml ADDED
@@ -0,0 +1,16 @@
+ name: Check file size
+ on: # or directly `on: [push]` to run the action on every push on any branch
+   pull_request:
+     branches: [main]
+
+   # to run this workflow manually from the Actions tab
+   workflow_dispatch:
+
+ jobs:
+   sync-to-hub:
+     runs-on: ubuntu-latest
+     steps:
+       - name: Check large files
+         uses: ActionsDesk/lfs-warning@v2.0
+         with:
+           filesizelimit: 10485760 # this is 10MB so we can sync to HF Spaces
.github/workflows/sync_to_hf.yml ADDED
@@ -0,0 +1,20 @@
+ name: Sync to Hugging Face hub
+ on:
+   push:
+     branches: [main]
+
+   # to run this workflow manually from the Actions tab
+   workflow_dispatch:
+
+ jobs:
+   sync-to-hub:
+     runs-on: ubuntu-latest
+     steps:
+       - uses: actions/checkout@v3
+         with:
+           fetch-depth: 0
+           lfs: true
+       - name: Push to hub
+         env:
+           HF_TOKEN: ${{ secrets.HF_TOKEN }}
+         run: git push https://seanpedrickcase:$HF_TOKEN@huggingface.co/spaces/seanpedrickcase/topic_modelling main
.gitignore ADDED
@@ -0,0 +1,8 @@
+ *.pyc
+ *.ipynb
+ *.npz
+ *.csv
+ *.pkl
+ .ipynb_checkpoints/*
+ old_code/*
+ model/*
Dockerfile ADDED
@@ -0,0 +1,30 @@
+ FROM python:3.10
+
+ WORKDIR /src
+
+ COPY requirements.txt .
+
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ # Set up a new user named "user" with user ID 1000
+ RUN useradd -m -u 1000 user
+ # Switch to the "user" user
+ USER user
+ # Set home to the user's home directory
+ ENV HOME=/home/user \
+ 	PATH=/home/user/.local/bin:$PATH \
+ 	PYTHONPATH=$HOME/app \
+ 	PYTHONUNBUFFERED=1 \
+ 	GRADIO_ALLOW_FLAGGING=never \
+ 	GRADIO_NUM_PORTS=1 \
+ 	GRADIO_SERVER_NAME=0.0.0.0 \
+ 	GRADIO_THEME=huggingface \
+ 	SYSTEM=spaces
+
+ # Set the working directory to the user's home directory
+ WORKDIR $HOME/app
+
+ # Copy the current directory contents into the container at $HOME/app, setting the owner to the user
+ COPY --chown=user . $HOME/app
+
+ CMD ["python", "app.py"]
LICENSE ADDED
@@ -0,0 +1,201 @@
+                                  Apache License
+                            Version 2.0, January 2004
+                         http://www.apache.org/licenses/
+
+    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+    1. Definitions.
+
+       "License" shall mean the terms and conditions for use, reproduction,
+       and distribution as defined by Sections 1 through 9 of this document.
+
+       "Licensor" shall mean the copyright owner or entity authorized by
+       the copyright owner that is granting the License.
+
+       "Legal Entity" shall mean the union of the acting entity and all
+       other entities that control, are controlled by, or are under common
+       control with that entity. For the purposes of this definition,
+       "control" means (i) the power, direct or indirect, to cause the
+       direction or management of such entity, whether by contract or
+       otherwise, or (ii) ownership of fifty percent (50%) or more of the
+       outstanding shares, or (iii) beneficial ownership of such entity.
+
+       "You" (or "Your") shall mean an individual or Legal Entity
+       exercising permissions granted by this License.
+
+       "Source" form shall mean the preferred form for making modifications,
+       including but not limited to software source code, documentation
+       source, and configuration files.
+
+       "Object" form shall mean any form resulting from mechanical
+       transformation or translation of a Source form, including but
+       not limited to compiled object code, generated documentation,
+       and conversions to other media types.
+
+       "Work" shall mean the work of authorship, whether in Source or
+       Object form, made available under the License, as indicated by a
+       copyright notice that is included in or attached to the work
+       (an example is provided in the Appendix below).
+
+       "Derivative Works" shall mean any work, whether in Source or Object
+       form, that is based on (or derived from) the Work and for which the
+       editorial revisions, annotations, elaborations, or other modifications
+       represent, as a whole, an original work of authorship. For the purposes
+       of this License, Derivative Works shall not include works that remain
+       separable from, or merely link (or bind by name) to the interfaces of,
+       the Work and Derivative Works thereof.
+
+       "Contribution" shall mean any work of authorship, including
+       the original version of the Work and any modifications or additions
+       to that Work or Derivative Works thereof, that is intentionally
+       submitted to Licensor for inclusion in the Work by the copyright owner
+       or by an individual or Legal Entity authorized to submit on behalf of
+       the copyright owner. For the purposes of this definition, "submitted"
+       means any form of electronic, verbal, or written communication sent
+       to the Licensor or its representatives, including but not limited to
+       communication on electronic mailing lists, source code control systems,
+       and issue tracking systems that are managed by, or on behalf of, the
+       Licensor for the purpose of discussing and improving the Work, but
+       excluding communication that is conspicuously marked or otherwise
+       designated in writing by the copyright owner as "Not a Contribution."
+
+       "Contributor" shall mean Licensor and any individual or Legal Entity
+       on behalf of whom a Contribution has been received by Licensor and
+       subsequently incorporated within the Work.
+
+    2. Grant of Copyright License. Subject to the terms and conditions of
+       this License, each Contributor hereby grants to You a perpetual,
+       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+       copyright license to reproduce, prepare Derivative Works of,
+       publicly display, publicly perform, sublicense, and distribute the
+       Work and such Derivative Works in Source or Object form.
+
+    3. Grant of Patent License. Subject to the terms and conditions of
+       this License, each Contributor hereby grants to You a perpetual,
+       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+       (except as stated in this section) patent license to make, have made,
+       use, offer to sell, sell, import, and otherwise transfer the Work,
+       where such license applies only to those patent claims licensable
+       by such Contributor that are necessarily infringed by their
+       Contribution(s) alone or by combination of their Contribution(s)
+       with the Work to which such Contribution(s) was submitted. If You
+       institute patent litigation against any entity (including a
+       cross-claim or counterclaim in a lawsuit) alleging that the Work
+       or a Contribution incorporated within the Work constitutes direct
+       or contributory patent infringement, then any patent licenses
+       granted to You under this License for that Work shall terminate
+       as of the date such litigation is filed.
+
+    4. Redistribution. You may reproduce and distribute copies of the
+       Work or Derivative Works thereof in any medium, with or without
+       modifications, and in Source or Object form, provided that You
+       meet the following conditions:
+
+       (a) You must give any other recipients of the Work or
+           Derivative Works a copy of this License; and
+
+       (b) You must cause any modified files to carry prominent notices
+           stating that You changed the files; and
+
+       (c) You must retain, in the Source form of any Derivative Works
+           that You distribute, all copyright, patent, trademark, and
+           attribution notices from the Source form of the Work,
+           excluding those notices that do not pertain to any part of
+           the Derivative Works; and
+
+       (d) If the Work includes a "NOTICE" text file as part of its
+           distribution, then any Derivative Works that You distribute must
+           include a readable copy of the attribution notices contained
+           within such NOTICE file, excluding those notices that do not
+           pertain to any part of the Derivative Works, in at least one
+           of the following places: within a NOTICE text file distributed
+           as part of the Derivative Works; within the Source form or
+           documentation, if provided along with the Derivative Works; or,
+           within a display generated by the Derivative Works, if and
+           wherever such third-party notices normally appear. The contents
+           of the NOTICE file are for informational purposes only and
+           do not modify the License. You may add Your own attribution
+           notices within Derivative Works that You distribute, alongside
+           or as an addendum to the NOTICE text from the Work, provided
+           that such additional attribution notices cannot be construed
+           as modifying the License.
+
+       You may add Your own copyright statement to Your modifications and
+       may provide additional or different license terms and conditions
+       for use, reproduction, or distribution of Your modifications, or
+       for any such Derivative Works as a whole, provided Your use,
+       reproduction, and distribution of the Work otherwise complies with
+       the conditions stated in this License.
+
+    5. Submission of Contributions. Unless You explicitly state otherwise,
+       any Contribution intentionally submitted for inclusion in the Work
+       by You to the Licensor shall be under the terms and conditions of
+       this License, without any additional terms or conditions.
+       Notwithstanding the above, nothing herein shall supersede or modify
+       the terms of any separate license agreement you may have executed
+       with Licensor regarding such Contributions.
+
+    6. Trademarks. This License does not grant permission to use the trade
+       names, trademarks, service marks, or product names of the Licensor,
+       except as required for reasonable and customary use in describing the
+       origin of the Work and reproducing the content of the NOTICE file.
+
+    7. Disclaimer of Warranty. Unless required by applicable law or
+       agreed to in writing, Licensor provides the Work (and each
+       Contributor provides its Contributions) on an "AS IS" BASIS,
+       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+       implied, including, without limitation, any warranties or conditions
+       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+       PARTICULAR PURPOSE. You are solely responsible for determining the
+       appropriateness of using or redistributing the Work and assume any
+       risks associated with Your exercise of permissions under this License.
+
+    8. Limitation of Liability. In no event and under no legal theory,
+       whether in tort (including negligence), contract, or otherwise,
+       unless required by applicable law (such as deliberate and grossly
+       negligent acts) or agreed to in writing, shall any Contributor be
+       liable to You for damages, including any direct, indirect, special,
+       incidental, or consequential damages of any character arising as a
+       result of this License or out of the use or inability to use the
+       Work (including but not limited to damages for loss of goodwill,
+       work stoppage, computer failure or malfunction, or any and all
+       other commercial damages or losses), even if such Contributor
+       has been advised of the possibility of such damages.
+
+    9. Accepting Warranty or Additional Liability. While redistributing
+       the Work or Derivative Works thereof, You may choose to offer,
+       and charge a fee for, acceptance of support, warranty, indemnity,
+       or other liability obligations and/or rights consistent with this
+       License. However, in accepting such obligations, You may act only
+       on Your own behalf and on Your sole responsibility, not on behalf
+       of any other Contributor, and only if You agree to indemnify,
+       defend, and hold each Contributor harmless for any liability
+       incurred by, or claims asserted against, such Contributor by reason
+       of your accepting any such warranty or additional liability.
+
+    END OF TERMS AND CONDITIONS
+
+    APPENDIX: How to apply the Apache License to your work.
+
+       To apply the Apache License to your work, attach the following
+       boilerplate notice, with the fields enclosed by brackets "[]"
+       replaced with your own identifying information. (Don't include
+       the brackets!) The text should be enclosed in the appropriate
+       comment syntax for the file format. We also recommend that a
+       file or class name and description of purpose be included on the
+       same "printed page" as the copyright notice for easier
+       identification within third-party archives.
+
+    Copyright [yyyy] [name of copyright owner]
+
+    Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
README.md ADDED
@@ -0,0 +1,13 @@
+ ---
+ title: Topic modelling
+ emoji: 🚀
+ colorFrom: red
+ colorTo: yellow
+ sdk: gradio
+ sdk_version: 3.50.0
+ app_file: app.py
+ pinned: false
+ license: apache-2.0
+ ---
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,250 @@
+ import os
+
+ #os.environ["TOKENIZERS_PARALLELISM"] = "true"
+ #os.environ["HF_HOME"] = "/mnt/c/..."
+ #os.environ["CUDA_PATH"] = "/mnt/c/..."
+
+ print(os.environ.get("HF_HOME")) # .get avoids a KeyError when HF_HOME is not set
+
+ import gradio as gr
+ from datetime import datetime
+ import pandas as pd
+ import numpy as np
+ from sklearn.cluster import KMeans
+ from sklearn.feature_extraction.text import CountVectorizer
+ from transformers import AutoModel
+ import funcs.anonymiser as anon
+
+ from torch import cuda, backends, version
+
+ # Check for torch cuda
+ print("Is CUDA enabled? ", cuda.is_available())
+ print("Is a CUDA device available on this computer?", backends.cudnn.enabled)
+ if cuda.is_available():
+     torch_device = "gpu"
+     print("Cuda version installed is: ", version.cuda)
+     low_resource_mode = "No"
+     #os.system("nvidia-smi")
+ else:
+     torch_device = "cpu"
+     low_resource_mode = "Yes"
+
+ print("Device used is: ", torch_device)
+
+ #os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
+
+ from bertopic import BERTopic
+ #from sentence_transformers import SentenceTransformer
+ #from bertopic.backend._hftransformers import HFTransformerBackend
+
+ #from cuml.manifold import UMAP
+
+ #umap_model = UMAP(n_components=5, n_neighbors=15, min_dist=0.0)
+
+ today = datetime.now().strftime("%d%m%Y")
+ today_rev = datetime.now().strftime("%Y%m%d")
+
+ from funcs.helper_functions import dummy_function, put_columns_in_df, read_file, get_file_path_end
+ from funcs.representation_model import representation_model
+ from funcs.embeddings import make_or_load_embeddings
+
+ # Load embeddings
+ #embedding_model_name = "BAAI/bge-small-en-v1.5"
+ #embedding_model = SentenceTransformer(embedding_model_name)
+
+ # Pinning a Jina revision for security purposes: https://www.baseten.co/blog/pinning-ml-model-revisions-for-compatibility-and-security/
+ # Save Jina model locally as described here: https://huggingface.co/jinaai/jina-embeddings-v2-base-en/discussions/29
+ embeddings_name = "jinaai/jina-embeddings-v2-small-en"
+ local_embeddings_location = "model/jina/"
+ revision_choice = "b811f03af3d4d7ea72a7c25c802b21fc675a5d99"
+
+ try:
+     embedding_model = AutoModel.from_pretrained(local_embeddings_location, revision=revision_choice, trust_remote_code=True, local_files_only=True, device_map="auto")
+ except Exception: # a bare except would also swallow KeyboardInterrupt and SystemExit
+     embedding_model = AutoModel.from_pretrained(embeddings_name, revision=revision_choice, trust_remote_code=True, device_map="auto")
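+
+ # Hedged one-off sketch (kept commented out, like the other alternatives above): saving the
+ # pinned model locally with transformers' standard save_pretrained would make the
+ # local_files_only=True branch above succeed on later runs.
+ #embedding_model.save_pretrained(local_embeddings_location)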
+
+
+ def extract_topics(in_files, in_file, min_docs_slider, in_colnames, max_topics_slider, candidate_topics, in_label, anonymise_drop, return_intermediate_files, embeddings_super_compress, low_resource_mode_opt):
+
+     file_list = [string.name for string in in_file]
+
+     data_file_names = [string.lower() for string in file_list if "tokenised" not in string and "npz" not in string.lower() and "gz" not in string.lower()]
+     data_file_name = data_file_names[0]
+     data_file_name_no_ext = get_file_path_end(data_file_name)
+
+     in_colnames_list_first = in_colnames[0]
+
+     if in_label:
+         in_label_list_first = in_label[0]
+     else:
+         in_label_list_first = in_colnames_list_first
+
+     if anonymise_drop == "Yes":
+         in_files_anon_col, anonymisation_success = anon.anonymise_script(in_files, in_colnames_list_first, anon_strat="replace")
+         in_files[in_colnames_list_first] = in_files_anon_col[in_colnames_list_first]
+         in_files.to_csv("anonymised_data.csv")
+
+     docs = list(in_files[in_colnames_list_first].str.lower())
+     label_col = in_files[in_label_list_first]
+
+     # Check if embeddings are being loaded in
+     ## Load in pre-embedded file if it exists
+     file_list = [string.name for string in in_file]
+
+     embeddings_out, reduced_embeddings = make_or_load_embeddings(docs, file_list, data_file_name_no_ext, embedding_model, return_intermediate_files, embeddings_super_compress, low_resource_mode_opt)
+
+     # all_lengths = [len(embedding) for embedding in embeddings_out]
+     # if len(set(all_lengths)) > 1:
+     #     print("Inconsistent lengths found in embeddings_out:", set(all_lengths))
+     # else:
+     #     print("All lengths are the same.")
+
+     # print("Embeddings type: ", type(embeddings_out))
+
+     # if isinstance(embeddings_out, np.ndarray):
+     #     print("my_object is a NumPy ndarray")
+     # else:
+     #     print("my_object is not a NumPy ndarray")
+
+     # Clustering set to K-means (not used)
+     #cluster_model = KMeans(n_clusters=max_topics_slider)
+
+     # Countvectoriser removes stopwords and combines terms up to two words long:
+     if min_docs_slider < 3:
+         min_df_val = min_docs_slider
+     else:
+         min_df_val = 3
+
+     print(min_df_val)
+
+     vectoriser_model = CountVectorizer(stop_words="english", ngram_range=(1, 2), min_df=min_df_val) # previously hard-coded as min_df=0.1, which silently ignored min_df_val computed above
+
+     if not candidate_topics:
+         topic_model = BERTopic(embedding_model=embedding_model,
+                                #hdbscan_model=cluster_model,
+                                vectorizer_model=vectoriser_model,
+                                min_topic_size=min_docs_slider,
+                                nr_topics=max_topics_slider,
+                                representation_model=representation_model,
+                                verbose=True)
+
+         topics_text, probs = topic_model.fit_transform(docs, embeddings_out)
+
+     # Do this if you have pre-assigned topics
+     else:
+         zero_shot_topics_list = read_file(candidate_topics.name)
+         zero_shot_topics_list_lower = [x.lower() for x in zero_shot_topics_list]
+
+         print(zero_shot_topics_list_lower)
+
+         topic_model = BERTopic(embedding_model=embedding_model,
+                                #hdbscan_model=cluster_model,
+                                vectorizer_model=vectoriser_model,
+                                min_topic_size=min_docs_slider,
+                                nr_topics=max_topics_slider,
+                                zeroshot_topic_list=zero_shot_topics_list_lower,
+                                zeroshot_min_similarity=0.7,
+                                representation_model=representation_model,
+                                verbose=True)
+
+         topics_text, probs = topic_model.fit_transform(docs, embeddings_out)
+
+     if not topics_text:
+         return "No topics found, original file returned", data_file_name, None # None fills the plot output so Gradio receives all three expected values
+
+     else:
+         topics_text_out = topics_text
+         topics_scores_out = probs
+
+         topic_det_output_name = "topic_details_" + today_rev + ".csv"
+
+         topic_dets = topic_model.get_topic_info()
+
+         topic_dets.to_csv(topic_det_output_name)
+         #print(topic_dets)
+
+         doc_det_output_name = "doc_details_" + today_rev + ".csv"
+         doc_dets = topic_model.get_document_info(docs)[["Document", "Topic", "Probability", "Name", "Representative_document"]]
+         doc_dets.to_csv(doc_det_output_name)
+         #print(doc_dets)
+
+         #topics_text_out_str = ', '.join(list(topic_dets["KeyBERT"]))
+
+         topics_text_out_str = str(topic_dets["KeyBERT"])
+         #topics_scores_out_str = str(doc_dets["Probability"][0])
+
+         output_text = "Topics: " + topics_text_out_str #+ "\n\nProbability scores: " + topics_scores_out_str
+
+         # Outputs
+         embedding_file_name = data_file_name_no_ext + '_' + 'embeddings.npz'
+         np.savez_compressed(embedding_file_name, embeddings_out)
+
+         topic_model_save_name = data_file_name_no_ext + "_topics_" + today_rev + ".pkl"
+         topic_model.save(topic_model_save_name, serialization='pickle', save_embedding_model=False, save_ctfidf=False)
+
+         # Visualise the topics:
+         topics_vis = topic_model.visualize_documents(label_col, reduced_embeddings=reduced_embeddings, hide_annotations=True, hide_document_hover=False, custom_labels=True)
+
+         return output_text, [doc_det_output_name, topic_det_output_name, embedding_file_name, topic_model_save_name], topics_vis
+
+
+ # ## Gradio app - extract topics
+
+ block = gr.Blocks(theme=gr.themes.Base())
+
+ with block:
+
+     data_state = gr.State(pd.DataFrame())
+
+     gr.Markdown(
+     """
+     # Extract topics from text
+     Enter open text below to get topics. You can copy and paste text directly, or upload a file and specify the column from which you want to extract topics.
+     """)
+
+     #with gr.Accordion("I will copy and paste my open text", open = False):
+     #    in_text = gr.Textbox(label="Copy and paste your open text here", lines = 5)
+
+     with gr.Tab("Load files and find topics"):
+         with gr.Accordion("Load data file", open = True):
+             in_files = gr.File(label="Input text from file", file_count="multiple")
+             with gr.Row():
+                 in_colnames = gr.Dropdown(choices=["Choose a column"], multiselect = True, label="Select column to find topics in (the first will be used if multiple are selected).")
+                 in_label = gr.Dropdown(choices=["Choose a column"], multiselect = True, label="Select column for labelling documents in the output visualisation.")
+
+         with gr.Accordion("I have my own list of topics. File should have at least one column with a header and topic keywords in cells below. Topics will be taken from the first column of the file", open = False):
+             candidate_topics = gr.File(label="Input topics from file (csv)")
+
+         with gr.Row():
+             min_docs_slider = gr.Slider(minimum = 1, maximum = 1000, value = 15, step = 1, label = "Minimum number of documents needed to create topic")
+             max_topics_slider = gr.Slider(minimum = 2, maximum = 500, value = 3, step = 1, label = "Maximum number of topics")
+
+         with gr.Row():
+             topics_btn = gr.Button("Extract topics")
+
+         with gr.Row():
+             output_single_text = gr.Textbox(label="Output example (first example in dataset)")
+             output_file = gr.File(label="Output file")
+
+         plot = gr.Plot(label="Visualise your topics here:")
+
+     with gr.Tab("Load and data processing options"):
+         with gr.Accordion("Process data on load", open = True):
+             anonymise_drop = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Anonymise data on file load.")
+             return_intermediate_files = gr.Dropdown(label = "Return intermediate processing files from file preparation. Files can be loaded in to save processing time in future.", value="No", choices=["Yes", "No"])
+             embedding_super_compress = gr.Dropdown(label = "Round embeddings to three dp for smaller files with less accuracy.", value="No", choices=["Yes", "No"])
+             low_resource_mode_opt = gr.Dropdown(label = "Low resource mode (non-AI embeddings, no LLM-generated topic names).", value=low_resource_mode, choices=["Yes", "No"])
+
+     # Update column names dropdown when file uploaded
+     in_files.upload(fn=put_columns_in_df, inputs=[in_files], outputs=[in_colnames, in_label, data_state])
+     in_colnames.change(dummy_function, in_colnames, None)
+
+     topics_btn.click(fn=extract_topics, inputs=[data_state, in_files, min_docs_slider, in_colnames, max_topics_slider, candidate_topics, in_label, anonymise_drop, return_intermediate_files, embedding_super_compress, low_resource_mode_opt], outputs=[output_single_text, output_file, plot], api_name="topics")
+
+ block.queue().launch(debug=True)#, server_name="0.0.0.0", ssl_verify=False, server_port=7860)
funcs/__init__.py ADDED
File without changes
funcs/anonymiser.py ADDED
@@ -0,0 +1,251 @@
+ import spacy
+ import os
+
+ def is_model_installed(model_name):
+     try:
+         # Try to load the model
+         spacy.load(model_name)
+         return True
+     except OSError:
+         return False
+
+ model_name = "en_core_web_sm"
+ if not is_model_installed(model_name):
+     os.system(f"python -m spacy download {model_name}")
+
+
+ # Need to overwrite version of gradio present in Huggingface spaces as it doesn't have like buttons/avatars (Oct 2023)
+ #os.system("pip uninstall -y gradio")
+ #os.system("pip install gradio==3.50.0")
+ #os.system("python -m spacy download en_core_web_lg")
+
+ spacy.load(model_name)
+
+ import re
+ import secrets
+ import base64
+ import time
+
+ import pandas as pd
+ import gradio as gr
+
+ from faker import Faker
+
+ from presidio_analyzer import AnalyzerEngine, BatchAnalyzerEngine
+ from presidio_anonymizer import AnonymizerEngine, BatchAnonymizerEngine
+ from presidio_anonymizer.entities import OperatorConfig
+
+ # Create faker function (note that it has to receive a value). Defined at module level
+ # so that anon_consistent_names can use it too; the original defined it inside
+ # anonymise_script, after anon_consistent_names had already referenced it.
+ fake = Faker("en_UK")
+
+ def fake_first_name(x):
+     return fake.first_name()
+
+
+ def anon_consistent_names(df):
+     # ## Pick out common names and replace them with the same person value
+     df_dict = df.to_dict(orient="list")
+
+     analyzer = AnalyzerEngine()
+     batch_analyzer = BatchAnalyzerEngine(analyzer_engine=analyzer)
+
+     analyzer_results = batch_analyzer.analyze_dict(df_dict, language="en")
+     analyzer_results = list(analyzer_results)
+
+     text = analyzer_results[3].value
+
+     recognizer_result = str(analyzer_results[3].recognizer_results)
+
+     data_str = recognizer_result # abbreviated for brevity
+
+     # Adjusting the parse_dict function to handle trailing ']'
+     # Splitting the main data string into individual list strings
+     list_strs = data_str[1:-1].split('], [')
+
+     def parse_dict(s):
+         s = s.strip('[]') # Removing any surrounding brackets
+         items = s.split(', ')
+         d = {}
+         for item in items:
+             key, value = item.split(': ')
+             if key == 'score':
+                 d[key] = float(value)
+             elif key in ['start', 'end']:
+                 d[key] = int(value)
+             else:
+                 d[key] = value
+         return d
+
+     # Re-running the improved processing code
+
+     result = []
+
+     for lst_str in list_strs:
+         # Splitting each list string into individual dictionary strings
+         dict_strs = lst_str.split(', type: ')
+         dict_strs = [dict_strs[0]] + ['type: ' + s for s in dict_strs[1:]] # Prepending "type: " back to the split strings
+
+         # Parsing each dictionary string
+         dicts = [parse_dict(d) for d in dict_strs]
+         result.append(dicts)
+
+     #result
+
+     names = []
+
+     for idx, paragraph in enumerate(text):
+         paragraph_texts = []
+         for dictionary in result[idx]:
+             if dictionary['type'] == 'PERSON':
+                 paragraph_texts.append(paragraph[dictionary['start']:dictionary['end']])
+         names.append(paragraph_texts)
+
+     # Flatten the list of lists and extract unique names
+     unique_names = list(set(name for sublist in names for name in sublist))
+
+     fake_names = pd.Series(unique_names).apply(fake_first_name)
+
+     mapping_df = pd.DataFrame(data={"Unique names":unique_names,
+                                     "Fake names": fake_names})
+
+     # Convert mapping dataframe to dictionary, adding word boundaries for full-word match
+     name_map = {r'\b' + k + r'\b': v for k, v in zip(mapping_df['Unique names'], mapping_df['Fake names'])}
+
+     scrubbed_df_consistent_names = df.replace(name_map, regex = True)
+
+     return scrubbed_df_consistent_names
+
+ def detect_file_type(filename):
+     """Detect the file type based on its extension."""
+     if (filename.endswith('.csv')) | (filename.endswith('.csv.gz')) | (filename.endswith('.zip')):
+         return 'csv'
+     elif filename.endswith('.xlsx'):
+         return 'xlsx'
+     elif filename.endswith('.parquet'):
+         return 'parquet'
+     else:
+         raise ValueError("Unsupported file type.")
+
+ def read_file(filename):
+     """Read the file based on its detected type."""
+     file_type = detect_file_type(filename)
+
+     if file_type == 'csv':
+         return pd.read_csv(filename, low_memory=False)
+     elif file_type == 'xlsx':
+         return pd.read_excel(filename)
+     elif file_type == 'parquet':
+         return pd.read_parquet(filename)
+
+ def anonymise_script(df, chosen_col, anon_strat):
+
+     # DataFrame to dict
+     df_dict = pd.DataFrame(data={chosen_col:df[chosen_col].astype(str)}).to_dict(orient="list")
+
+     analyzer = AnalyzerEngine()
+     batch_analyzer = BatchAnalyzerEngine(analyzer_engine=analyzer)
+
+     anonymizer = AnonymizerEngine()
+
+     batch_anonymizer = BatchAnonymizerEngine(anonymizer_engine = anonymizer)
+
+     print("Identifying personal data")
+     analyse_tic = time.perf_counter()
+     analyzer_results = batch_analyzer.analyze_dict(df_dict, language="en")
+     #print(analyzer_results)
+     analyzer_results = list(analyzer_results)
+
+     analyse_toc = time.perf_counter()
+     analyse_time_out = f"Analysing the text took {analyse_toc - analyse_tic:0.1f} seconds."
+     print(analyse_time_out)
+
+     # Generate a 128-bit AES key, then encode the key using base64 to get a string representation
+     key = secrets.token_bytes(16) # 128 bits = 16 bytes
+     key_string = base64.b64encode(key).decode('utf-8')
+
+     # Set up the anonymization configuration WITHOUT DATE_TIME
+     # (plain dict literals; the original wrapped these in unnecessary eval() calls)
+     replace_config = {"DEFAULT": OperatorConfig("replace")}
+     redact_config = {"DEFAULT": OperatorConfig("redact")}
+     hash_config = {"DEFAULT": OperatorConfig("hash")}
+     mask_config = {"DEFAULT": OperatorConfig("mask", {"masking_char":"*", "chars_to_mask":100, "from_end":True})}
+     people_encrypt_config = {"PERSON": OperatorConfig("encrypt", {"key": key_string})} # The encryption uses an AES cypher in CBC mode and requires a cryptographic key as an input for both the encryption and the decryption.
+     fake_first_name_config = {"PERSON": OperatorConfig("custom", {"lambda": fake_first_name})}
+
+     if anon_strat == "replace": chosen_mask_config = replace_config
+     elif anon_strat == "redact": chosen_mask_config = redact_config
+     elif anon_strat == "hash": chosen_mask_config = hash_config
+     elif anon_strat == "mask": chosen_mask_config = mask_config
+     elif anon_strat == "encrypt": chosen_mask_config = people_encrypt_config
+     elif anon_strat == "fake_first_name": chosen_mask_config = fake_first_name_config
+     else: chosen_mask_config = replace_config # fall back to replace rather than failing on an unknown strategy
+
+     # I think in general people will want to keep date / times
+     keep_date_config = {"DATE_TIME": OperatorConfig("keep")}
+
+     combined_config = {**chosen_mask_config, **keep_date_config}
+
+     anonymizer_results = batch_anonymizer.anonymize_dict(analyzer_results, operators=combined_config)
+
+     scrubbed_df = pd.DataFrame(anonymizer_results)
+
+     # Create reporting message
+     out_message = "Successfully anonymised"
+
+     if anon_strat == "encrypt":
+         out_message = out_message + ". Your decryption key is " + key_string + "."
+
+     return scrubbed_df, out_message
+
221
+ def do_anonymise(in_file, anon_strat, chosen_cols):
222
+
223
+ # Load file
224
+
225
+ anon_df = pd.DataFrame()
226
+
227
+ if in_file:
228
+ for match_file in in_file:
229
+ match_temp_file = pd.read_csv(match_file.name, delimiter = ",", low_memory=False)#, encoding='cp1252')
230
+ anon_df = pd.concat([anon_df, match_temp_file])
231
+
232
+ # Split dataframe to keep only selected columns
233
+ all_cols_original_order = list(anon_df.columns)
234
+ anon_df_part = anon_df[chosen_cols]
235
+ anon_df_remain = anon_df.drop(chosen_cols, axis = 1)
236
+
237
+ # Anonymise the selected columns
238
+ anon_df_part_out, out_message = anonymise_script(anon_df_part, anon_strat)
239
+
240
+ # Rejoin the dataframe together
241
+ anon_df_out = pd.concat([anon_df_part_out, anon_df_remain], axis = 1)
242
+ anon_df_out = anon_df_out[all_cols_original_order]
243
+
244
+ # Export file
245
+ out_file_part = re.sub(r'\.csv', '', match_file.name)
246
+
247
+ anon_export_file_name = out_file_part + "_anon_" + anon_strat + ".csv"
248
+
249
+ anon_df_out.to_csv(anon_export_file_name, index = None)
250
+
251
+ return out_message, anon_export_file_name
funcs/embeddings.py ADDED
@@ -0,0 +1,78 @@
+ import time
+ import numpy as np
+ from torch import cuda
+ from sklearn.pipeline import make_pipeline
+ from sklearn.decomposition import TruncatedSVD
+ from sklearn.feature_extraction.text import TfidfVectorizer
+ from umap import UMAP
+
+ if cuda.is_available():
+     torch_device = "gpu"
+ else:
+     torch_device = "cpu"
+
+ def make_or_load_embeddings(docs, file_list, data_file_name_no_ext, embedding_model, return_intermediate_files, embeddings_super_compress, low_resource_mode_opt):
+
+     embeddings_file_names = [string.lower() for string in file_list if "embedding" in string.lower()]
+
+     if embeddings_file_names:
+         print("Loading embeddings from file.")
+         embeddings_out = np.load(embeddings_file_names[0])['arr_0']
+
+         # If embedding files have 'super_compress' in the title, they have been multiplied by 100 before save
+         if "compress" in embeddings_file_names[0]:
+             embeddings_out /= 100
+
+         # print("embeddings loaded: ", embeddings_out)
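+         # Worked round trip of the compression scheme: 0.123456 is rounded to 0.123 and
+         # saved as 12.3 (x100, see the save branch below); dividing by 100 here restores 0.123.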
+
+     if not embeddings_file_names:
+         tic = time.perf_counter()
+         print("Starting to embed documents.")
+
+         # Custom model
+         # If on CPU, don't resort to embedding models
+         if low_resource_mode_opt == "Yes":
+             print("Creating simplified 'sparse' embeddings based on TfIDF")
+             embedding_model = make_pipeline(
+                 TfidfVectorizer(),
+                 TruncatedSVD(100)
+             )
+
+             # sklearn pipelines have no .encode method (the original call would fail), so
+             # fit_transform builds the reduced TF-IDF matrix instead
+             embeddings_out = embedding_model.fit_transform(docs)
+
+         elif low_resource_mode_opt == "No":
+             print("Creating dense embeddings based on transformers model")
+
+             embeddings_out = embedding_model.encode(sentences=docs, max_length=1024, show_progress_bar = True, batch_size = 32) # For Jina
+
+             #import torch
+             #from torch.nn.utils.rnn import pad_sequence
+
+             # Assuming embeddings_out is a list of tensors
+             #embeddings_out = [torch.tensor(embedding) for embedding in embeddings_out]
+
+             # Pad the sequences
+             # Set batch_first=True if you want the batch dimension to be the first dimension
+             #embeddings_out = pad_sequence(embeddings_out, batch_first=True, padding_value=0)
+
+         toc = time.perf_counter()
+         time_out = f"The embedding took {toc - tic:0.1f} seconds"
+         print(time_out)
+
+         # If you want to save your files for next time
+         if return_intermediate_files == "Yes":
+             if embeddings_super_compress == "No":
+                 semantic_search_file_name = data_file_name_no_ext + '_' + 'embeddings.npz'
+                 np.savez_compressed(semantic_search_file_name, embeddings_out)
+             else:
+                 semantic_search_file_name = data_file_name_no_ext + '_' + 'embedding_compress.npz'
+                 embeddings_out_round = np.round(embeddings_out, 3)
+                 embeddings_out_round *= 100 # Rounding not currently used
+                 np.savez_compressed(semantic_search_file_name, embeddings_out_round)
+
+     # Pre-reduce embeddings for visualisation purposes
+     reduced_embeddings = UMAP(n_neighbors=15, n_components=2, min_dist=0.0, metric='cosine', random_state=42).fit_transform(embeddings_out)
+
+     return embeddings_out, reduced_embeddings
funcs/helper_functions.py ADDED
@@ -0,0 +1,89 @@
+ import os
+ import re
+ import pandas as pd
+ import gradio as gr
+ import gzip
+ import pickle
+
+
+ def detect_file_type(filename):
+     """Detect the file type based on its extension."""
+     if (filename.endswith('.csv')) | (filename.endswith('.csv.gz')) | (filename.endswith('.zip')):
+         return 'csv'
+     elif filename.endswith('.xlsx'):
+         return 'xlsx'
+     elif filename.endswith('.parquet'):
+         return 'parquet'
+     elif filename.endswith('.pkl.gz'):
+         return 'pkl.gz'
+     else:
+         raise ValueError("Unsupported file type.")
+
+ def read_file(filename):
+     """Read the file based on its detected type."""
+     file_type = detect_file_type(filename)
+
+     print("Loading in file")
+
+     if file_type == 'csv':
+         file = pd.read_csv(filename, low_memory=False).reset_index().drop(["index", "Unnamed: 0"], axis=1, errors="ignore")
+     elif file_type == 'xlsx':
+         file = pd.read_excel(filename).reset_index().drop(["index", "Unnamed: 0"], axis=1, errors="ignore")
+     elif file_type == 'parquet':
+         file = pd.read_parquet(filename).reset_index().drop(["index", "Unnamed: 0"], axis=1, errors="ignore")
+     elif file_type == 'pkl.gz':
+         with gzip.open(filename, 'rb') as file:
+             file = pickle.load(file)
+         #file = pd.read_pickle(filename)
+
+     print("File load complete")
+
+     return file
+
+ def put_columns_in_df(in_file, in_bm25_column=None):
+     '''
+     When a file is loaded, update the column dropdown choices and change the 'clean data' dropdown option to 'no'.
+     in_bm25_column is unused and defaults to None, since the app only passes the file input.
+     '''
+
+     file_list = [string.name for string in in_file]
+
+     data_file_names = [string.lower() for string in file_list if "npz" not in string.lower()]
+     data_file_name = data_file_names[0]
+
+     new_choices = []
+     concat_choices = []
+
+     df = read_file(data_file_name)
+
+     new_choices = list(df.columns)
+
+     concat_choices.extend(new_choices)
+
+     return gr.Dropdown(choices=concat_choices), gr.Dropdown(choices=concat_choices), df
+
+ def get_file_path_end(file_path):
+     # First, get the basename of the file (e.g., "example.txt" from "/path/to/example.txt")
+     basename = os.path.basename(file_path)
+
+     # Then, split the basename and its extension and return only the basename without the extension
+     filename_without_extension, _ = os.path.splitext(basename)
+
+     #print(filename_without_extension)
+
+     return filename_without_extension
+
+ def get_file_path_end_with_ext(file_path):
+     match = re.search(r'(.*[\/\\])?(.+)$', file_path)
+
+     filename_end = match.group(2) if match else ''
+
+     return filename_end
+
+ def dummy_function(in_colnames):
+     """
+     A dummy function that exists just so that dropdown updates work correctly.
+     """
+     return None
funcs/prompts.py ADDED
@@ -0,0 +1,106 @@
+ # A generic Hermes 2 chat prompt (superseded below by the topic-labelling open_hermes_prompt)
+ open_hermes_prompt = """<|im_start|>system
+ You are "Hermes 2", a conscious sentient superintelligent artificial intelligence developed by a man named Teknium, and your purpose and drive is to assist the user with any request they have.<|im_end|>
+ <|im_start|>user
+ Hello, who are you?<|im_end|>
+ <|im_start|>assistant
+ """
+
+
+ # Example prompt demonstrating the output we are looking for
+ capybara_start = "USER:"
+
+ capybara_example_prompt = """USER:I have a topic that contains the following documents:
+ - Traditional diets in most cultures were primarily plant-based with a little meat on top, but with the rise of industrial style meat production and factory farming, meat has become a staple food.
+ - Meat, but especially beef, is the worst food in terms of emissions.
+ - Eating meat doesn't make you a bad person, not eating meat doesn't make you a good one.
+
+ The topic is described by the following keywords: 'meat, beef, eat, eating, emissions, steak, food, health, processed, chicken'.
+
+ Based on the information about the topic above, please create a short label of this topic. Make sure to only return the label and nothing more.
+
+ Topic label: Environmental impacts of eating meat
+ """
+
+
+ # Our main prompt with documents ([DOCUMENTS]) and keywords ([KEYWORDS]) tags
+ capybara_main_prompt = """
+ Now, create a new topic label given the following information.
+
+ I have a topic that contains the following documents:
+ [DOCUMENTS]
+
+ The topic is described by the following keywords: '[KEYWORDS]'.
+
+ Based on the information about the topic above, please create a short label of this topic. Make sure to only return the label and nothing more.
+ ASSISTANT:Topic label:"""
+
+ capybara_prompt = capybara_example_prompt + capybara_main_prompt
+
+ print("Capybara prompt: ", capybara_prompt)
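+
+ # BERTopic's representation models substitute the literal [DOCUMENTS] and [KEYWORDS]
+ # tags at runtime; a rough, hypothetical illustration of the finished prompt:
+ #print(capybara_main_prompt.replace("[DOCUMENTS]", "- Example document one.\n- Example document two.").replace("[KEYWORDS]", "example, keywords"))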
+
+ # System prompt describes information given to all conversations
+ open_hermes_start = "<|im_start|>"
+ open_hermes_system_prompt = """<|im_start|>system
+ You are a helpful, respectful and honest assistant for labeling topics.<|im_end|>
+ """
+
+ # Example prompt demonstrating the output we are looking for
+ open_hermes_example_prompt = """<|im_start|>user
+ I have a topic that contains the following documents:
+ - Traditional diets in most cultures were primarily plant-based with a little meat on top, but with the rise of industrial style meat production and factory farming, meat has become a staple food.
+ - Meat, but especially beef, is the worst food in terms of emissions.
+ - Eating meat doesn't make you a bad person, not eating meat doesn't make you a good one.
+
+ The topic is described by the following keywords: 'meat, beef, eat, eating, emissions, steak, food, health, processed, chicken'.
+
+ Based on the information about the topic above, please create a short label of this topic. Make sure to only return the label and nothing more.
+
+ Topic label: Environmental impacts of eating meat
+ """
+ open_hermes_main_prompt = """
+ Now, create a new topic label given the following information.
+
+ I have a topic that contains the following documents:
+ [DOCUMENTS]
+
+ The topic is described by the following keywords: '[KEYWORDS]'.
+
+ Based on the information about the topic above, please create a short label of this topic. Make sure to only return the label and nothing more.<|im_end|>
+ <|im_start|>assistant
+ Topic label:
+ """
+ open_hermes_prompt = open_hermes_system_prompt + open_hermes_example_prompt + open_hermes_main_prompt
+
+ print("Open Hermes prompt: ", open_hermes_prompt)
+
+ stablelm_start = "<|user|>"
+ stablelm_example_prompt = """<|user|>
+ I have a topic that contains the following documents:
+ - Traditional diets in most cultures were primarily plant-based with a little meat on top, but with the rise of industrial style meat production and factory farming, meat has become a staple food.
+ - Meat, but especially beef, is the worst food in terms of emissions.
+ - Eating meat doesn't make you a bad person, not eating meat doesn't make you a good one.
+
+ The topic is described by the following keywords: 'meat, beef, eat, eating, emissions, steak, food, health, processed, chicken'.
+
+ Based on the information about the topic above, please create a short label of this topic. Make sure to only return the label and nothing more.
+
+ Topic label: Environmental impacts of eating meat
+ """
+
+ # Our main prompt with documents ([DOCUMENTS]) and keywords ([KEYWORDS]) tags
+ stablelm_main_prompt = """
+ Now, create a new topic label given the following information.
+
+ I have a topic that contains the following documents:
+ [DOCUMENTS]
+
+ The topic is described by the following keywords: '[KEYWORDS]'.
+
+ Based on the information about the topic above, please create a short label of this topic. Make sure to only return the label and nothing more.<|endoftext|>
+ <|assistant|>
+ Topic label:"""
+
+ stablelm_prompt = stablelm_example_prompt + stablelm_main_prompt
+
+ print("StableLM prompt: ", stablelm_prompt)
funcs/representation_model.py ADDED
@@ -0,0 +1,171 @@
+ import os
+ #from ctransformers import AutoModelForCausalLM
+ #from transformers import AutoTokenizer, pipeline
+ from bertopic.representation import LlamaCPP
+ from llama_cpp import Llama
+ from pydantic import BaseModel
+ import torch.cuda
+
+ from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, TextGeneration
+ from funcs.prompts import capybara_prompt, capybara_start, open_hermes_prompt, open_hermes_start, stablelm_prompt, stablelm_start
+
+ #from huggingface_hub import hf_hub_download
+ #hf_hub_download(repo_id='second-state/stablelm-2-zephyr-1.6b-GGUF', filename='stablelm-2-zephyr-1_6b-Q5_K_M.gguf')
+
+ hf_model_name = 'TheBloke/phi-2-orange-GGUF' #'NousResearch/Nous-Capybara-7B-V1.9-GGUF' # 'second-state/stablelm-2-zephyr-1.6b-GGUF'
+ hf_model_file = 'phi-2-orange.Q5_K_M.gguf' #'Capybara-7B-V1.9-Q5_K_M.gguf' # 'stablelm-2-zephyr-1_6b-Q5_K_M.gguf'
+ chosen_prompt = open_hermes_prompt # stablelm_prompt
+ chosen_start_tag = open_hermes_start # stablelm_start
+
+ # Find model file
+ def find_model_file(hf_model_name, hf_model_file):
+     # Fall back to the default Hugging Face cache when HF_HOME is unset (indexing os.environ directly raises a KeyError)
+     hf_sub_loc = os.environ.get("HF_HOME", os.path.expanduser("~/.cache/huggingface")) + "/hub/"
+
+     hf_model_name_path = hf_sub_loc + 'models--' + hf_model_name.replace("/","--")
+
+     print(hf_model_name_path)
+
+     def find_file(root_folder, file_name):
+         for root, dirs, files in os.walk(root_folder):
+             if file_name in files:
+                 return os.path.join(root, file_name)
+         return None
+
+     # Example usage
+     folder_path = hf_model_name_path # Replace with your folder path
+     file_to_find = hf_model_file # Replace with the file name you're looking for
+
+     found_file = find_file(folder_path, file_to_find)
+     if found_file:
+         print(f"File found: {found_file}")
+         return found_file
+     else:
+         error = "File not found."
+         print(error)
+         return error
+
+ found_file = find_model_file(hf_model_name, hf_model_file)
+
+ # n_gpu_layers can be dropped back to 0 even with CUDA available if persistent bugs appear in the GPU implementation
+ if torch.cuda.is_available():
+     torch_device = "gpu"
+     low_resource_mode = "No"
+     n_gpu_layers = 100
+ else:
+     torch_device = "cpu"
+     low_resource_mode = "Yes"
+     n_gpu_layers = 0
+
+ #low_resource_mode = "Yes"
+
+ #print("Running on device:", torch_device)
+ n_threads = torch.get_num_threads()
+ print("CPU n_threads:", n_threads)
+
+ # Default model parameters
+ temperature: float = 0.1
+ top_k: int = 3
+ top_p: float = 1
+ repeat_penalty: float = 1.1
+ last_n_tokens_size: int = 128
+ max_tokens: int = 500
+ seed: int = 42
+ reset: bool = True
+ stream: bool = False
+ n_threads: int = n_threads
+ n_batch: int = 256
+ n_ctx: int = 4096
+ sample: bool = True
+ trust_remote_code: bool = True
+
+ class LLamacppInitConfigGpu(BaseModel):
+     last_n_tokens_size: int
+     seed: int
+     n_threads: int
+     n_batch: int
+     n_ctx: int
+     n_gpu_layers: int
+     temperature: float
+     top_k: int
+     top_p: float
+     repeat_penalty: float
+     max_tokens: int
+     reset: bool
+     stream: bool
+     stop: str
+     trust_remote_code: bool
+
+     def update_gpu(self, new_value: int):
+         self.n_gpu_layers = new_value
+
+ gpu_config = LLamacppInitConfigGpu(last_n_tokens_size=last_n_tokens_size,
+                                    seed=seed,
+                                    n_threads=n_threads,
+                                    n_batch=n_batch,
+                                    n_ctx=n_ctx,
+                                    n_gpu_layers=n_gpu_layers,
+                                    temperature=temperature,
+                                    top_k=top_k,
+                                    top_p=top_p,
+                                    repeat_penalty=repeat_penalty,
+                                    max_tokens=max_tokens,
+                                    reset=reset,
+                                    stream=stream,
+                                    stop=chosen_start_tag,
+                                    trust_remote_code=trust_remote_code)
+
+ cpu_config = gpu_config.model_copy()
+ cpu_config.update_gpu(0)
+
+ class LLamacppGenerateConfig(BaseModel):
+     temperature: float
+     top_k: int
+     top_p: float
+     repeat_penalty: float
+     max_tokens: int
+     reset: bool
+     stream: bool
+
+ gen_config = LLamacppGenerateConfig(
+     temperature=temperature,
+     top_k=top_k,
+     top_p=top_p,
+     repeat_penalty=repeat_penalty,
+     max_tokens=max_tokens,
+     reset=reset,
+     stream=stream)
+
+ ## Create representation model parameters ##
+ # KeyBERT
+ keybert = KeyBERTInspired()
+
+ if low_resource_mode == "No":
+     # Use llama.cpp to load in the model
+     llm = Llama(model_path=found_file, stop=chosen_start_tag, n_gpu_layers=n_gpu_layers, n_ctx=n_ctx) #**gpu_config.model_dump())
+     #print(llm.n_gpu_layers)
+     llm_model = LlamaCPP(llm, prompt=chosen_prompt)#, **gen_config.model_dump())
+
+     # All representation models
+     representation_model = {
+         "KeyBERT": keybert,
+         "Mistral": llm_model
+     }
+
+ elif low_resource_mode == "Yes":
+     representation_model = {"KeyBERT": keybert}
+
+ # Deprecated example using CTransformers. This package is not really used anymore
+ #model = AutoModelForCausalLM.from_pretrained('NousResearch/Nous-Capybara-7B-V1.9-GGUF', model_type='mistral', model_file='Capybara-7B-V1.9-Q5_K_M.gguf', hf=True, **vars(gpu_config))
+ #tokenizer = AutoTokenizer.from_pretrained("NousResearch/Nous-Capybara-7B-V1.9")
+ #generator = pipeline(task="text-generation", model=model, tokenizer=tokenizer)
+
+ # Text generation with Llama 2
+ #mistral_capybara = TextGeneration(generator, prompt=capybara_prompt)
+ #mistral_hermes = TextGeneration(generator, prompt=open_hermes_prompt)
+
+
+ # MMR (is rubbish, don't use)
+ #mmr = MaximalMarginalRelevance(diversity=0.3)
requirements.txt ADDED
@@ -0,0 +1,11 @@
+ gradio==3.50.0
+ transformers
+ accelerate
+ torch
+ llama-cpp-python
+ bertopic
+ spacy
+ pyarrow
+ faker
+ presidio_analyzer
+ presidio_anonymizer