Sean-Case committed
Commit 4cfed8e • Parent(s): 72f2310

Model export changed to safetensors. Improved representational model function. Got zero-shot topic modelling working.
Files changed:
- .gitignore +5 -1
- app.py +118 -55
- funcs/embeddings.py +14 -5
- funcs/helper_functions.py +33 -0
- funcs/prompts.py +3 -3
- funcs/representation_model.py +33 -32
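Note: the diffs below wire zero-shot topic modelling into the app and switch model export to safetensors serialisation. As a rough, standalone sketch of the two BERTopic calls involved (the corpus and candidate topic list here are illustrative stand-ins, and the zeroshot_topic_list/zeroshot_min_similarity arguments assume BERTopic >= 0.16, the version this commit targets):

# Hypothetical, minimal sketch of the flow this commit enables; not the app's own code.
from bertopic import BERTopic

# Illustrative stand-in corpus; a real run uses the text column chosen in the UI.
docs = (["my cat chased the dog around the garden"] * 50
        + ["the bank raised interest rates again this quarter"] * 50
        + ["we booked flights and a hotel for the summer holiday"] * 50)
candidate_topics = ["pets", "finance", "travel"]  # zero-shot topic list, normally read from a csv

topic_model = BERTopic(zeroshot_topic_list=candidate_topics,
                       zeroshot_min_similarity=0.7,   # same threshold app.py passes below
                       min_topic_size=15,
                       verbose=True)
topics, probs = topic_model.fit_transform(docs)

# Export with safetensors serialisation, mirroring the new output step in app.py
topic_model.save("output_model/example_topics", serialization="safetensors",
                 save_embedding_model=True, save_ctfidf=False)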
.gitignore CHANGED
@@ -4,6 +4,10 @@
 *.csv
 *.pkl
 *.parquet
+*.png
+*.safetensors
+*.json
 .ipynb_checkpoints/*
 old_code/*
-model/*
+model/*
+output_model/*
app.py CHANGED
@@ -1,17 +1,14 @@
-import os
-
-#os.environ["TOKENIZERS_PARALLELISM"] = "true"
-#os.environ["HF_HOME"] = "/mnt/c/..."
-#os.environ["CUDA_PATH"] = "/mnt/c/..."
-#print(os.environ["HF_HOME"])
-
 import gradio as gr
 from datetime import datetime
 import pandas as pd
 import numpy as np
 from sklearn.cluster import KMeans
 from sklearn.feature_extraction.text import CountVectorizer
-from transformers import AutoModel
+from transformers import AutoModel, AutoTokenizer
+from transformers.pipelines import pipeline
+from sklearn.pipeline import make_pipeline
+from sklearn.decomposition import TruncatedSVD
+from sklearn.feature_extraction.text import TfidfVectorizer
 import funcs.anonymiser as anon
 
 from torch import cuda, backends, version
@@ -43,10 +40,11 @@ from bertopic import BERTopic
 today = datetime.now().strftime("%d%m%Y")
 today_rev = datetime.now().strftime("%Y%m%d")
 
-from funcs.helper_functions import dummy_function, put_columns_in_df, read_file, get_file_path_end
-from funcs.representation_model import representation_model
+from funcs.helper_functions import dummy_function, put_columns_in_df, read_file, get_file_path_end, zip_folder, delete_files_in_folder
+#from funcs.representation_model import representation_model
 from funcs.embeddings import make_or_load_embeddings
 
+
 # Load embeddings
 #embedding_model_name = "BAAI/bge-small-en-v1.5"
 #embedding_model = SentenceTransformer(embedding_model_name)
@@ -57,14 +55,24 @@ embeddings_name = "jinaai/jina-embeddings-v2-small-en"
 local_embeddings_location = "model/jina/"
 revision_choice = "b811f03af3d4d7ea72a7c25c802b21fc675a5d99"
 
-
-
-
-
+if low_resource_mode == "No":
+    try:
+        embedding_model = AutoModel.from_pretrained(local_embeddings_location, revision = revision_choice, trust_remote_code=True, local_files_only=True, device_map="auto")
+    except:
+        embedding_model = AutoModel.from_pretrained(embeddings_name, revision = revision_choice, trust_remote_code=True, device_map="auto")
+
+    tokenizer = AutoTokenizer.from_pretrained("jinaai/jina-embeddings-v2-small-en")
+
+    embedding_model_pipe = pipeline("feature-extraction", model=embedding_model, tokenizer=tokenizer)
 
+elif low_resource_mode == "Yes":
+    embedding_model_pipe = make_pipeline(
+        TfidfVectorizer(),
+        TruncatedSVD(2) # 100 # set to 2 to be compatible with zero shot topics - can't be higher than number of topics
+    )
 
 
-def extract_topics(in_files, in_file, min_docs_slider, in_colnames, max_topics_slider, candidate_topics, in_label, anonymise_drop, return_intermediate_files, embeddings_super_compress,
+def extract_topics(in_files, in_file, min_docs_slider, in_colnames, max_topics_slider, candidate_topics, in_label, anonymise_drop, return_intermediate_files, embeddings_super_compress, low_resource_mode, create_llm_topic_labels):
 
     file_list = [string.name for string in in_file]
 
@@ -91,7 +99,28 @@ def extract_topics(in_files, in_file, min_docs_slider, in_colnames, max_topics_slider,
     ## Load in pre-embedded file if exists
     file_list = [string.name for string in in_file]
 
-
+    print("Low resource mode: ", low_resource_mode)
+
+    if low_resource_mode == "No":
+        print("Choosing high resource Jina transformer model")
+        try:
+            embedding_model = AutoModel.from_pretrained(local_embeddings_location, revision = revision_choice, trust_remote_code=True, local_files_only=True, device_map="auto")
+        except:
+            embedding_model = AutoModel.from_pretrained(embeddings_name, revision = revision_choice, trust_remote_code=True, device_map="auto")
+
+        tokenizer = AutoTokenizer.from_pretrained("jinaai/jina-embeddings-v2-small-en")
+
+        embedding_model_pipe = pipeline("feature-extraction", model=embedding_model, tokenizer=tokenizer)
+
+    elif low_resource_mode == "Yes":
+        print("Choosing low resource TfIDF model")
+        embedding_model_pipe = make_pipeline(
+            TfidfVectorizer(),
+            TruncatedSVD(2) # 100 # To be compatible with zero shot, this needs to be lower than number of suggested topics
+        )
+        embedding_model = embedding_model_pipe
+
+    embeddings_out, reduced_embeddings = make_or_load_embeddings(docs, file_list, data_file_name_no_ext, embedding_model, return_intermediate_files, embeddings_super_compress, low_resource_mode, create_llm_topic_labels)
 
     # all_lengths = [len(embedding) for embedding in embeddings_out]
     # if len(set(all_lengths)) > 1:
@@ -110,18 +139,24 @@ def extract_topics(in_files, in_file, min_docs_slider, in_colnames, max_topics_slider,
     #cluster_model = KMeans(n_clusters=max_topics_slider)
 
     # Countvectoriser removes stopwords, combines terms up to 2 together:
-    if min_docs_slider < 3:
-        min_df_val = min_docs_slider
-    else:
-        min_df_val = 3
+    #if min_docs_slider < 3:
+    #    min_df_val = min_docs_slider
+    #else:
+    #    min_df_val = 3
 
-    print(min_df_val)
+    #print(min_df_val)
 
     vectoriser_model = CountVectorizer(stop_words="english", ngram_range=(1, 2), min_df=0.1)
 
+
+    from funcs.prompts import capybara_prompt, capybara_start, open_hermes_prompt, open_hermes_start, stablelm_prompt, stablelm_start
+    from funcs.representation_model import create_representation_model, found_file, gpu_config, chosen_start_tag
+
+    print("Create LLM topic labels:", create_llm_topic_labels)
+    representation_model = create_representation_model(create_llm_topic_labels, gpu_config, found_file, chosen_start_tag)
 
     if not candidate_topics:
-        topic_model = BERTopic( embedding_model=
+        topic_model = BERTopic( embedding_model=embedding_model_pipe,
                                 #hdbscan_model=cluster_model,
                                 vectorizer_model=vectoriser_model,
                                 min_topic_size= min_docs_slider,
@@ -134,17 +169,18 @@ def extract_topics(in_files, in_file, min_docs_slider, in_colnames, max_topics_slider,
 
     # Do this if you have pre-assigned topics
     else:
-
-
+        zero_shot_topics = read_file(candidate_topics.name)
+        #print(zero_shot_topics)
+        zero_shot_topics_lower = list(zero_shot_topics.iloc[:, 0].str.lower())
 
-        print(
+        print(zero_shot_topics_lower)
 
-        topic_model = BERTopic( embedding_model=
+        topic_model = BERTopic( embedding_model=embedding_model_pipe,
                                 #hdbscan_model=cluster_model,
                                 vectorizer_model=vectoriser_model,
                                 min_topic_size = min_docs_slider,
                                 nr_topics = max_topics_slider,
-                                zeroshot_topic_list =
+                                zeroshot_topic_list = zero_shot_topics_lower,
                                 zeroshot_min_similarity = 0.7,
                                 representation_model=representation_model,
                                 verbose = True)
@@ -152,44 +188,69 @@ def extract_topics(in_files, in_file, min_docs_slider, in_colnames, max_topics_slider,
     topics_text, probs = topic_model.fit_transform(docs, embeddings_out)
 
     if not topics_text:
-        return "No topics found, original file returned", data_file_name
+        return "No topics found, original file returned", data_file_name, None
 
     else:
-
-        topics_scores_out = probs
-
-        topic_det_output_name = "topic_details_" + today_rev + ".csv"
+        print("Preparing topic model outputs.")
 
         topic_dets = topic_model.get_topic_info()
+        #print(topic_dets.columns)
 
-        topic_dets.
-
+        if topic_dets.shape[0] == 1:
+            topic_det_output_name = "topic_details_" + data_file_name_no_ext + "_" + today_rev + ".csv"
+            topic_dets.to_csv(topic_det_output_name)
 
-
-        doc_dets = topic_model.get_document_info(docs)[["Document", "Topic", "Probability", "Name", "Representative_document"]]
-        doc_dets.to_csv(doc_det_output_name)
-        #print(doc_dets)
-
-        #print(topic_dets)
-        #topics_text_out_str = ', '.join(list(topic_dets["KeyBERT"]))
+            return "No topics found, original file returned", [data_file_name, topic_det_output_name], None
 
-
-
-
-
+        # Replace original labels with LLM labels
+        if "Mistral" in topic_model.get_topic_info().columns:
+            llm_labels = [label[0][0].split("\n")[0] for label in topic_model.get_topics(full=True)["Mistral"].values()]
+            topic_model.set_topic_labels(llm_labels)
+        else:
+            topic_model.set_topic_labels(list(topic_dets["Name"]))
 
         # Outputs
+
+        topic_det_output_name = "topic_details_" + data_file_name_no_ext + "_" + today_rev + ".csv"
+        topic_dets.to_csv(topic_det_output_name)
+
+        doc_det_output_name = "doc_details_" + data_file_name_no_ext + "_" + today_rev + ".csv"
+        doc_dets = topic_model.get_document_info(docs)[["Document", "Topic", "Name", "Representative_document"]] # "Probability",
+        doc_dets.to_csv(doc_det_output_name)
+
+        topics_text_out_str = str(topic_dets["Name"])
+        output_text = "Topics: " + topics_text_out_str
+
         embedding_file_name = data_file_name_no_ext + '_' + 'embeddings.npz'
         np.savez_compressed(embedding_file_name, embeddings_out)
 
-
-
+        #if low_resource_mode == "No":
+        topic_model_save_name_folder = "output_model/" + data_file_name_no_ext + "_topics_" + today_rev# + ".safetensors"
+        topic_model_save_name_zip = topic_model_save_name_folder + ".zip"
+
+        # Clear folder before replacing files
+        delete_files_in_folder(topic_model_save_name_folder)
+
+        topic_model.save(topic_model_save_name_folder, serialization='safetensors', save_embedding_model=True, save_ctfidf=False)
+
+        # Zip file example
+
+        zip_folder(topic_model_save_name_folder, topic_model_save_name_zip)
 
         # Visualise the topics:
         topics_vis = topic_model.visualize_documents(label_col, reduced_embeddings=reduced_embeddings, hide_annotations=True, hide_document_hover=False, custom_labels=True)
+
+        #return output_text, [doc_det_output_name, topic_det_output_name, embedding_file_name, topic_model_save_name_zip], topics_vis
+
+        #elif low_resource_mode == "Yes":
+        #    # Visualise the topics:
+        #    topics_vis = topic_model.visualize_documents(label_col, reduced_embeddings=reduced_embeddings, hide_annotations=True, hide_document_hover=False, custom_labels=True)
 
-        return output_text, [doc_det_output_name, topic_det_output_name, embedding_file_name
+        #    return output_text, [doc_det_output_name, topic_det_output_name, embedding_file_name], topics_vis
+
+        return output_text, [doc_det_output_name, topic_det_output_name, embedding_file_name, topic_model_save_name_zip], topics_vis
 
+        # , topic_model_save_name
 
 # ## Gradio app - extract topics
 
@@ -219,7 +280,7 @@ with block:
             candidate_topics = gr.File(label="Input topics from file (csv)")
 
             with gr.Row():
-                min_docs_slider = gr.Slider(minimum =
+                min_docs_slider = gr.Slider(minimum = 2, maximum = 1000, value = 15, step = 1, label = "Minimum number of documents needed to create topic")
                 max_topics_slider = gr.Slider(minimum = 2, maximum = 500, value = 3, step = 1, label = "Maximum number of topics")
 
             with gr.Row():
@@ -233,17 +294,19 @@ with block:
 
     with gr.Tab("Load and data processing options"):
        with gr.Accordion("Process data on load", open = True):
-
-
-
-
-
+            with gr.Row():
+                anonymise_drop = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Anonymise data on file load. Names and other details are replaced with tags e.g. '<person>'.")
+                return_intermediate_files = gr.Dropdown(label = "Return intermediate processing files from file preparation. Files can be loaded in to save processing time in future.", value="No", choices=["Yes", "No"])
+                embedding_super_compress = gr.Dropdown(label = "Round embeddings to three dp for smaller files with less accuracy.", value="No", choices=["Yes", "No"])
+            with gr.Row():
+                low_resource_mode_opt = gr.Dropdown(label = "Low resource mode (non-AI embeddings, no LLM-generated topic names).", value="No", choices=["Yes", "No"])
+                create_llm_topic_labels = gr.Dropdown(label = "Create LLM-generated topic labels.", value="No", choices=["Yes", "No"])
 
     # Update column names dropdown when file uploaded
    in_files.upload(fn=put_columns_in_df, inputs=[in_files], outputs=[in_colnames, in_label, data_state])
    in_colnames.change(dummy_function, in_colnames, None)
 
-    topics_btn.click(fn=extract_topics, inputs=[data_state, in_files, min_docs_slider, in_colnames, max_topics_slider, candidate_topics, in_label, anonymise_drop, return_intermediate_files, embedding_super_compress, low_resource_mode_opt], outputs=[output_single_text, output_file, plot], api_name="topics")
+    topics_btn.click(fn=extract_topics, inputs=[data_state, in_files, min_docs_slider, in_colnames, max_topics_slider, candidate_topics, in_label, anonymise_drop, return_intermediate_files, embedding_super_compress, low_resource_mode_opt, create_llm_topic_labels], outputs=[output_single_text, output_file, plot], api_name="topics")
 
 block.queue().launch(debug=True)#, server_name="0.0.0.0", ssl_verify=False, server_port=7860)
 
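Note: a self-contained sketch of the "low resource mode" embedding backend that app.py now builds, i.e. a scikit-learn pipeline standing in for a transformer embedding model (the corpus below is illustrative; the real pipeline receives the documents chosen in the UI):

from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

docs = ["complaint about waiting times", "praise for friendly staff",
        "question about opening hours", "feedback on parking"]

embedding_pipe = make_pipeline(
    TfidfVectorizer(),
    TruncatedSVD(2)   # kept at 2 so the output dimensionality stays no higher than the zero-shot topic count
)

# funcs/embeddings.py now calls fit() and then transform() on this pipeline
embeddings_out = embedding_pipe.fit(docs).transform(docs)
print(embeddings_out.shape)   # (4, 2)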
funcs/embeddings.py CHANGED
@@ -11,7 +11,7 @@ if cuda.is_available():
 else:
     torch_device = "cpu"
 
-def make_or_load_embeddings(docs, file_list, data_file_name_no_ext, embedding_model, return_intermediate_files, embeddings_super_compress, low_resource_mode_opt):
+def make_or_load_embeddings(docs, file_list, data_file_name_no_ext, embedding_model, return_intermediate_files, embeddings_super_compress, low_resource_mode_opt, reduce_embeddings="Yes"):
 
     embeddings_file_names = [string.lower() for string in file_list if "embedding" in string.lower()]
 
@@ -38,11 +38,19 @@ def make_or_load_embeddings(docs, file_list, data_file_name_no_ext, embedding_mo
             TruncatedSVD(100)
         )
 
-
+        # Fit the pipeline to the text data
+        embedding_model.fit(docs)
+
+        # Transform text data to embeddings
+        embeddings_out = embedding_model.transform(docs)
+
+        #embeddings_out = embedding_model.encode(sentences=docs, show_progress_bar = True, batch_size = 32)
 
     elif low_resource_mode_opt == "No":
         print("Creating dense embeddings based on transformers model")
 
+        #print("Embedding model is: ", embedding_model)
+
         embeddings_out = embedding_model.encode(sentences=docs, max_length=1024, show_progress_bar = True, batch_size = 32) # For Jina # #
 
         #import torch
@@ -72,7 +80,8 @@ def make_or_load_embeddings(docs, file_list, data_file_name_no_ext, embedding_mo
             np.savez_compressed(semantic_search_file_name, embeddings_out_round)
 
     # Pre-reduce embeddings for visualisation purposes
-
-
+    if reduce_embeddings == "Yes":
+        reduced_embeddings = UMAP(n_neighbors=15, n_components=2, min_dist=0.0, metric='cosine', random_state=42).fit_transform(embeddings_out)
+        return embeddings_out, reduced_embeddings
 
-    return embeddings_out,
+    return embeddings_out, None
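Note: the new reduce_embeddings branch pre-computes a 2-D UMAP projection once, so visualize_documents() in app.py can reuse it instead of re-running UMAP. A small sketch with stand-in data (assumes the umap-learn package, which the UMAP call above comes from):

import numpy as np
from umap import UMAP   # provided by the umap-learn package

embeddings_out = np.random.rand(200, 384)   # stand-in for the document embeddings

# Same parameters as the new line in make_or_load_embeddings()
reduced_embeddings = UMAP(n_neighbors=15, n_components=2, min_dist=0.0,
                          metric='cosine', random_state=42).fit_transform(embeddings_out)
print(reduced_embeddings.shape)   # (200, 2)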
funcs/helper_functions.py CHANGED
@@ -1,4 +1,5 @@
 import os
+import zipfile
 import re
 import pandas as pd
 import gradio as gr
@@ -87,3 +88,35 @@ def dummy_function(in_colnames):
     A dummy function that exists just so that dropdown updates work correctly.
     """
     return None
+
+# Zip the above to export file
+
+
+def zip_folder(folder_path, output_zip_file):
+    # Create a ZipFile object in write mode
+    with zipfile.ZipFile(output_zip_file, 'w', zipfile.ZIP_DEFLATED) as zipf:
+        # Walk through the directory
+        for root, dirs, files in os.walk(folder_path):
+            for file in files:
+                # Create a complete file path
+                file_path = os.path.join(root, file)
+                # Add file to the zip file
+                # The arcname argument sets the archive name, i.e., the name within the zip file
+                zipf.write(file_path, arcname=os.path.relpath(file_path, folder_path))
+
+def delete_files_in_folder(folder_path):
+    # Check if the folder exists
+    if not os.path.exists(folder_path):
+        print(f"The folder {folder_path} does not exist.")
+        return
+
+    # Iterate over all files in the folder and remove each
+    for filename in os.listdir(folder_path):
+        file_path = os.path.join(folder_path, filename)
+        try:
+            if os.path.isfile(file_path) or os.path.islink(file_path):
+                os.unlink(file_path)
+            else:
+                print(f"Skipping {file_path} as it is a directory")
+        except Exception as e:
+            print(f"Failed to delete {file_path}. Reason: {e}")
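Note: app.py uses these two new helpers around topic_model.save() when exporting the model folder. A usage sketch with an illustrative folder name (run from the repo root so funcs.helper_functions imports):

import os
from funcs.helper_functions import zip_folder, delete_files_in_folder

save_folder = "example_model_export"        # illustrative folder name
os.makedirs(save_folder, exist_ok=True)

delete_files_in_folder(save_folder)         # clear any previous export first
# ... topic_model.save(save_folder, serialization='safetensors', ...) would write its files here ...
zip_folder(save_folder, save_folder + ".zip")   # bundle the folder into a single downloadable file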
funcs/prompts.py CHANGED
@@ -37,7 +37,7 @@ ASSISTANT:Topic label:"""
 
 capybara_prompt = capybara_example_prompt + capybara_main_prompt
 
-print("Capybara prompt: ", capybara_prompt)
+#print("Capybara prompt: ", capybara_prompt)
 
 # System prompt describes information given to all conversations
 open_hermes_start="<|im_start|>"
@@ -72,7 +72,7 @@ Topic label:
 """
 open_hermes_prompt = open_hermes_system_prompt + open_hermes_example_prompt + open_hermes_main_prompt
 
-print("Open Hermes prompt: ", open_hermes_prompt)
+#print("Open Hermes prompt: ", open_hermes_prompt)
 
 stablelm_start = "<|user|>"
 stablelm_example_prompt = """<|user|>
@@ -103,4 +103,4 @@ Topic label:"""
 
 stablelm_prompt = stablelm_example_prompt + stablelm_main_prompt
 
-print("StableLM prompt: ", stablelm_prompt)
+#print("StableLM prompt: ", stablelm_prompt)
funcs/representation_model.py CHANGED
@@ -9,8 +9,6 @@ import torch.cuda
 from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, TextGeneration
 from funcs.prompts import capybara_prompt, capybara_start, open_hermes_prompt, open_hermes_start, stablelm_prompt, stablelm_start
 
-#from huggingface_hub import hf_hub_download
-#hf_hub_download(repo_id='second-state/stablelm-2-zephyr-1.6b-GGUF', filename='stablelm-2-zephyr-1_6b-Q5_K_M.gguf')
 
 hf_model_name = 'TheBloke/phi-2-orange-GGUF' #'NousResearch/Nous-Capybara-7B-V1.9-GGUF' # 'second-state/stablelm-2-zephyr-1.6b-GGUF'
 hf_model_file = 'phi-2-orange.Q5_K_M.gguf' #'Capybara-7B-V1.9-Q5_K_M.gguf' # 'stablelm-2-zephyr-1_6b-Q5_K_M.gguf'
@@ -18,9 +16,9 @@ chosen_prompt = open_hermes_prompt # stablelm_prompt
 chosen_start_tag = open_hermes_start # stablelm_start
 
 # Find model file
-def find_model_file(hf_model_name, hf_model_file):
-    hf_loc = os.environ["HF_HOME"]
-    hf_sub_loc = os.environ["HF_HOME"]
+def find_model_file(hf_model_name, hf_model_file, search_folder):
+    hf_loc = search_folder #os.environ["HF_HOME"]
+    hf_sub_loc = search_folder + "/hub/" #os.environ["HF_HOME"]
 
     hf_model_name_path = hf_sub_loc + 'models--' + hf_model_name.replace("/","--")
 
@@ -36,16 +34,19 @@ def find_model_file(hf_model_name, hf_model_file):
     folder_path = hf_model_name_path # Replace with your folder path
     file_to_find = hf_model_file # Replace with the file name you're looking for
 
-    found_file = find_file(folder_path, file_to_find)
+    found_file = find_file(folder_path, file_to_find) # os.environ["HF_HOME"]
     if found_file:
         print(f"File found: {found_file}")
         return found_file
     else:
         error = "File not found."
-        print(error)
-
+        print(error, " Downloading model from hub")
+        from huggingface_hub import hf_hub_download
+        hf_hub_download(repo_id=hf_model_name, filename='phi-2-orange.Q5_K_M.gguf')
+        found_file = find_file(folder_path, file_to_find)
+        return found_file
 
-found_file = find_model_file(hf_model_name, hf_model_file)
+found_file = find_model_file(hf_model_name, hf_model_file, os.environ["HF_HOME"])#".")
 
 # Currently set n_gpu_layers to 0 even with cuda due to persistent bugs in implementation with cuda
 if torch.cuda.is_available():
@@ -57,7 +58,7 @@ else:
     low_resource_mode = "Yes"
     n_gpu_layers = 0
 
-
+low_resource_mode = "No" # Override for testing
 
 #print("Running on device:", torch_device)
 n_threads = torch.get_num_threads()
@@ -140,32 +141,32 @@ gen_config = LLamacppGenerateConfig(
 # KeyBERT
 keybert = KeyBERTInspired()
 
-
-# Use llama.cpp to load in model
-llm = Llama(model_path=found_file, stop=chosen_start_tag, n_gpu_layers=n_gpu_layers, n_ctx=n_ctx) #**gpu_config.model_dump())#
-#print(llm.n_gpu_layers)
-llm_model = LlamaCPP(llm, prompt=chosen_prompt)#, **gen_config.model_dump())
-
-# All representation models
-representation_model = {
-    "KeyBERT": keybert,
-    "Mistral": llm_model
-}
+def create_representation_model(create_llm_topic_labels, gpu_config, found_file, chosen_start_tag):
 
-
-
+    if create_llm_topic_labels == "Yes":
+        # Use llama.cpp to load in model
+        llm = Llama(model_path=found_file, stop=chosen_start_tag, n_gpu_layers=gpu_config.n_gpu_layers, n_ctx=gpu_config.n_ctx) #**gpu_config.model_dump())#
+        #print(llm.n_gpu_layers)
+        llm_model = LlamaCPP(llm, prompt=chosen_prompt)#, **gen_config.model_dump())
 
-#
-
-
-
+        # All representation models
+        representation_model = {
+            "KeyBERT": keybert,
+            "Mistral": llm_model
+        }
 
-
-
-#mistral_hermes = TextGeneration(generator, prompt=open_hermes_prompt)
+    elif create_llm_topic_labels == "No":
+        representation_model = {"KeyBERT": keybert}
 
+    # Deprecated example using CTransformers. This package is not really used anymore
+    #model = AutoModelForCausalLM.from_pretrained('NousResearch/Nous-Capybara-7B-V1.9-GGUF', model_type='mistral', model_file='Capybara-7B-V1.9-Q5_K_M.gguf', hf=True, **vars(gpu_config))
+    #tokenizer = AutoTokenizer.from_pretrained("NousResearch/Nous-Capybara-7B-V1.9")
+    #generator = pipeline(task="text-generation", model=model, tokenizer=tokenizer)
 
+    # Text generation with Llama 2
+    #mistral_capybara = TextGeneration(generator, prompt=capybara_prompt)
+    #mistral_hermes = TextGeneration(generator, prompt=open_hermes_prompt)
+
+    return representation_model
 
-# MMR (is rubbish, don't use)
-#mmr = MaximalMarginalRelevance(diversity=0.3)
 
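Note: with this refactor, app.py builds the representation dictionary on demand rather than at import time. A sketch of the call site is below; importing this module still runs the model-file lookup above, so it may trigger a download, and passing "No" keeps only the KeyBERT representation and avoids loading the LLM:

from bertopic import BERTopic
from funcs.representation_model import create_representation_model, found_file, gpu_config, chosen_start_tag

# "No" -> {"KeyBERT": keybert}; "Yes" -> also adds the llama.cpp-backed "Mistral" labeller
representation_model = create_representation_model("No", gpu_config, found_file, chosen_start_tag)

topic_model = BERTopic(representation_model=representation_model, verbose=True)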