Sean-Case committed
Commit
4cfed8e
1 Parent(s): 72f2310

Model export changed to safetensors. Improved representation model function. Got zero-shot topic modelling working.

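For reference, the zero-shot flow and safetensors export that this commit wires up follow the public BERTopic API. A minimal standalone sketch, separate from the app code below, using an illustrative candidate topic list and the 20 newsgroups sample data (assumes BERTopic >= 0.16 and scikit-learn; the output path is made up):

from bertopic import BERTopic
from sklearn.datasets import fetch_20newsgroups

# A small sample of documents; in the app these come from the uploaded data file
docs = fetch_20newsgroups(subset="train", remove=("headers", "footers", "quotes")).data[:1000]

# Hypothetical candidate topics; in the app these are read from the uploaded CSV
zero_shot_topic_list = ["space and astronomy", "medicine", "religion", "computer hardware"]

topic_model = BERTopic(zeroshot_topic_list=zero_shot_topic_list, zeroshot_min_similarity=0.7, min_topic_size=15)
topics, probs = topic_model.fit_transform(docs)

# Export as a safetensors folder (the format this commit switches to), which can then be zipped for download
topic_model.save("output_model/example_topics", serialization="safetensors", save_ctfidf=False, save_embedding_model=True)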
.gitignore CHANGED
@@ -4,6 +4,10 @@
 *.csv
 *.pkl
 *.parquet
+*.png
+*.safetensors
+*.json
 .ipynb_checkpoints/*
 old_code/*
-model/*
+model/*
+output_model/*
app.py CHANGED
@@ -1,17 +1,14 @@
-import os
-
-#os.environ["TOKENIZERS_PARALLELISM"] = "true"
-#os.environ["HF_HOME"] = "/mnt/c/..."
-#os.environ["CUDA_PATH"] = "/mnt/c/..."
-#print(os.environ["HF_HOME"])
-
 import gradio as gr
 from datetime import datetime
 import pandas as pd
 import numpy as np
 from sklearn.cluster import KMeans
 from sklearn.feature_extraction.text import CountVectorizer
-from transformers import AutoModel
+from transformers import AutoModel, AutoTokenizer
+from transformers.pipelines import pipeline
+from sklearn.pipeline import make_pipeline
+from sklearn.decomposition import TruncatedSVD
+from sklearn.feature_extraction.text import TfidfVectorizer
 import funcs.anonymiser as anon
 
 from torch import cuda, backends, version
@@ -43,10 +40,11 @@ from bertopic import BERTopic
 today = datetime.now().strftime("%d%m%Y")
 today_rev = datetime.now().strftime("%Y%m%d")
 
-from funcs.helper_functions import dummy_function, put_columns_in_df, read_file, get_file_path_end
-from funcs.representation_model import representation_model
+from funcs.helper_functions import dummy_function, put_columns_in_df, read_file, get_file_path_end, zip_folder, delete_files_in_folder
+#from funcs.representation_model import representation_model
 from funcs.embeddings import make_or_load_embeddings
 
+
 # Load embeddings
 #embedding_model_name = "BAAI/bge-small-en-v1.5"
 #embedding_model = SentenceTransformer(embedding_model_name)
@@ -57,14 +55,24 @@ embeddings_name = "jinaai/jina-embeddings-v2-small-en"
 local_embeddings_location = "model/jina/"
 revision_choice = "b811f03af3d4d7ea72a7c25c802b21fc675a5d99"
 
-try:
-    embedding_model = AutoModel.from_pretrained(local_embeddings_location, revision = revision_choice, trust_remote_code=True, local_files_only=True, device_map="auto")
-except:
-    embedding_model = AutoModel.from_pretrained(embeddings_name, revision = revision_choice, trust_remote_code=True, device_map="auto")
+if low_resource_mode == "No":
+    try:
+        embedding_model = AutoModel.from_pretrained(local_embeddings_location, revision = revision_choice, trust_remote_code=True, local_files_only=True, device_map="auto")
+    except:
+        embedding_model = AutoModel.from_pretrained(embeddings_name, revision = revision_choice, trust_remote_code=True, device_map="auto")
+
+    tokenizer = AutoTokenizer.from_pretrained("jinaai/jina-embeddings-v2-small-en")
+
+    embedding_model_pipe = pipeline("feature-extraction", model=embedding_model, tokenizer=tokenizer)
 
+elif low_resource_mode == "Yes":
+    embedding_model_pipe = make_pipeline(
+        TfidfVectorizer(),
+        TruncatedSVD(2) # 100 # set to 2 to be compatible with zero shot topics - can't be higher than number of topics
+    )
 
 
-def extract_topics(in_files, in_file, min_docs_slider, in_colnames, max_topics_slider, candidate_topics, in_label, anonymise_drop, return_intermediate_files, embeddings_super_compress, low_resource_mode_opt):
+def extract_topics(in_files, in_file, min_docs_slider, in_colnames, max_topics_slider, candidate_topics, in_label, anonymise_drop, return_intermediate_files, embeddings_super_compress, low_resource_mode, create_llm_topic_labels):
 
     file_list = [string.name for string in in_file]
 
@@ -91,7 +99,28 @@ def extract_topics(in_files, in_file, min_docs_slider, in_colnames, max_topics_s
     ## Load in pre-embedded file if exists
     file_list = [string.name for string in in_file]
 
-    embeddings_out, reduced_embeddings = make_or_load_embeddings(docs, file_list, data_file_name_no_ext, embedding_model, return_intermediate_files, embeddings_super_compress, low_resource_mode_opt)
+    print("Low resource mode: ", low_resource_mode)
+
+    if low_resource_mode == "No":
+        print("Choosing high resource Jina transformer model")
+        try:
+            embedding_model = AutoModel.from_pretrained(local_embeddings_location, revision = revision_choice, trust_remote_code=True, local_files_only=True, device_map="auto")
+        except:
+            embedding_model = AutoModel.from_pretrained(embeddings_name, revision = revision_choice, trust_remote_code=True, device_map="auto")
+
+        tokenizer = AutoTokenizer.from_pretrained("jinaai/jina-embeddings-v2-small-en")
+
+        embedding_model_pipe = pipeline("feature-extraction", model=embedding_model, tokenizer=tokenizer)
+
+    elif low_resource_mode == "Yes":
+        print("Choosing low resource TfIDF model")
+        embedding_model_pipe = make_pipeline(
+            TfidfVectorizer(),
+            TruncatedSVD(2) # 100 # To be compatible with zero shot, this needs to be lower than number of suggested topics
+        )
+        embedding_model = embedding_model_pipe
+
+    embeddings_out, reduced_embeddings = make_or_load_embeddings(docs, file_list, data_file_name_no_ext, embedding_model, return_intermediate_files, embeddings_super_compress, low_resource_mode, create_llm_topic_labels)
 
     # all_lengths = [len(embedding) for embedding in embeddings_out]
     # if len(set(all_lengths)) > 1:
@@ -110,18 +139,24 @@ def extract_topics(in_files, in_file, min_docs_slider, in_colnames, max_topics_s
     #cluster_model = KMeans(n_clusters=max_topics_slider)
 
     # Countvectoriser removes stopwords, combines terms up to 2 together:
-    if min_docs_slider < 3:
-        min_df_val = min_docs_slider
-    else:
-        min_df_val = 3
+    #if min_docs_slider < 3:
+    #    min_df_val = min_docs_slider
+    #else:
+    #    min_df_val = 3
 
-    print(min_df_val)
+    #print(min_df_val)
 
     vectoriser_model = CountVectorizer(stop_words="english", ngram_range=(1, 2), min_df=0.1)
 
+
+    from funcs.prompts import capybara_prompt, capybara_start, open_hermes_prompt, open_hermes_start, stablelm_prompt, stablelm_start
+    from funcs.representation_model import create_representation_model, found_file, gpu_config, chosen_start_tag
+
+    print("Create LLM topic labels:", create_llm_topic_labels)
+    representation_model = create_representation_model(create_llm_topic_labels, gpu_config, found_file, chosen_start_tag)
 
     if not candidate_topics:
-        topic_model = BERTopic( embedding_model=embedding_model,
+        topic_model = BERTopic( embedding_model=embedding_model_pipe,
                                 #hdbscan_model=cluster_model,
                                 vectorizer_model=vectoriser_model,
                                 min_topic_size= min_docs_slider,
@@ -134,17 +169,18 @@ def extract_topics(in_files, in_file, min_docs_slider, in_colnames, max_topics_s
 
     # Do this if you have pre-assigned topics
     else:
-        zero_shot_topics_list = read_file(candidate_topics.name)
-        zero_shot_topics_list_lower = [x.lower() for x in zero_shot_topics_list]
+        zero_shot_topics = read_file(candidate_topics.name)
+        #print(zero_shot_topics)
+        zero_shot_topics_lower = list(zero_shot_topics.iloc[:, 0].str.lower())
 
-        print(zero_shot_topics_list_lower)
+        print(zero_shot_topics_lower)
 
-        topic_model = BERTopic( embedding_model=embedding_model,
+        topic_model = BERTopic( embedding_model=embedding_model_pipe,
                                 #hdbscan_model=cluster_model,
                                 vectorizer_model=vectoriser_model,
                                 min_topic_size = min_docs_slider,
                                 nr_topics = max_topics_slider,
-                                zeroshot_topic_list = zero_shot_topics_list_lower,
+                                zeroshot_topic_list = zero_shot_topics_lower,
                                 zeroshot_min_similarity = 0.7,
                                 representation_model=representation_model,
                                 verbose = True)
@@ -152,44 +188,69 @@ def extract_topics(in_files, in_file, min_docs_slider, in_colnames, max_topics_s
     topics_text, probs = topic_model.fit_transform(docs, embeddings_out)
 
     if not topics_text:
-        return "No topics found, original file returned", data_file_name
+        return "No topics found, original file returned", data_file_name, None
 
     else:
-        topics_text_out = topics_text
-        topics_scores_out = probs
-
-        topic_det_output_name = "topic_details_" + today_rev + ".csv"
+        print("Preparing topic model outputs.")
 
         topic_dets = topic_model.get_topic_info()
+        #print(topic_dets.columns)
 
-        topic_dets.to_csv(topic_det_output_name)
-        #print(topic_dets)
+        if topic_dets.shape[0] == 1:
+            topic_det_output_name = "topic_details_" + data_file_name_no_ext + "_" + today_rev + ".csv"
+            topic_dets.to_csv(topic_det_output_name)
 
-        doc_det_output_name = "doc_details_" + today_rev + ".csv"
-        doc_dets = topic_model.get_document_info(docs)[["Document", "Topic", "Probability", "Name", "Representative_document"]]
-        doc_dets.to_csv(doc_det_output_name)
-        #print(doc_dets)
-
-        #print(topic_dets)
-        #topics_text_out_str = ', '.join(list(topic_dets["KeyBERT"]))
+            return "No topics found, original file returned", [data_file_name, topic_det_output_name], None
 
-        topics_text_out_str = str(topic_dets["KeyBERT"])
-        #topics_scores_out_str = str(doc_dets["Probability"][0])
-
-        output_text = "Topics: " + topics_text_out_str #+ "\n\nProbability scores: " + topics_scores_out_str
+        # Replace original labels with LLM labels
+        if "Mistral" in topic_model.get_topic_info().columns:
+            llm_labels = [label[0][0].split("\n")[0] for label in topic_model.get_topics(full=True)["Mistral"].values()]
+            topic_model.set_topic_labels(llm_labels)
+        else:
+            topic_model.set_topic_labels(list(topic_dets["Name"]))
 
         # Outputs
+
+        topic_det_output_name = "topic_details_" + data_file_name_no_ext + "_" + today_rev + ".csv"
+        topic_dets.to_csv(topic_det_output_name)
+
+        doc_det_output_name = "doc_details_" + data_file_name_no_ext + "_" + today_rev + ".csv"
+        doc_dets = topic_model.get_document_info(docs)[["Document", "Topic", "Name", "Representative_document"]] # "Probability",
+        doc_dets.to_csv(doc_det_output_name)
+
+        topics_text_out_str = str(topic_dets["Name"])
+        output_text = "Topics: " + topics_text_out_str
+
        embedding_file_name = data_file_name_no_ext + '_' + 'embeddings.npz'
        np.savez_compressed(embedding_file_name, embeddings_out)
 
-        topic_model_save_name = data_file_name_no_ext + "_topics_" + today_rev + ".pkl"
-        topic_model.save(topic_model_save_name, serialization='pickle', save_embedding_model=False, save_ctfidf=False)
+        #if low_resource_mode == "No":
+        topic_model_save_name_folder = "output_model/" + data_file_name_no_ext + "_topics_" + today_rev# + ".safetensors"
+        topic_model_save_name_zip = topic_model_save_name_folder + ".zip"
+
+        # Clear folder before replacing files
+        delete_files_in_folder(topic_model_save_name_folder)
+
+        topic_model.save(topic_model_save_name_folder, serialization='safetensors', save_embedding_model=True, save_ctfidf=False)
+
+        # Zip file example
+
+        zip_folder(topic_model_save_name_folder, topic_model_save_name_zip)
 
         # Visualise the topics:
         topics_vis = topic_model.visualize_documents(label_col, reduced_embeddings=reduced_embeddings, hide_annotations=True, hide_document_hover=False, custom_labels=True)
+
+        #return output_text, [doc_det_output_name, topic_det_output_name, embedding_file_name, topic_model_save_name_zip], topics_vis
+
+        #elif low_resource_mode == "Yes":
+        #    # Visualise the topics:
+        #    topics_vis = topic_model.visualize_documents(label_col, reduced_embeddings=reduced_embeddings, hide_annotations=True, hide_document_hover=False, custom_labels=True)
 
-        return output_text, [doc_det_output_name, topic_det_output_name, embedding_file_name, topic_model_save_name], topics_vis
+        #    return output_text, [doc_det_output_name, topic_det_output_name, embedding_file_name], topics_vis
+
+        return output_text, [doc_det_output_name, topic_det_output_name, embedding_file_name, topic_model_save_name_zip], topics_vis
 
+        # , topic_model_save_name
 
     # ## Gradio app - extract topics
 
@@ -219,7 +280,7 @@ with block:
         candidate_topics = gr.File(label="Input topics from file (csv)")
 
         with gr.Row():
-            min_docs_slider = gr.Slider(minimum = 1, maximum = 1000, value = 15, step = 1, label = "Minimum number of documents needed to create topic")
+            min_docs_slider = gr.Slider(minimum = 2, maximum = 1000, value = 15, step = 1, label = "Minimum number of documents needed to create topic")
             max_topics_slider = gr.Slider(minimum = 2, maximum = 500, value = 3, step = 1, label = "Maximum number of topics")
 
         with gr.Row():
@@ -233,17 +294,19 @@ with block:
 
     with gr.Tab("Load and data processing options"):
        with gr.Accordion("Process data on load", open = True):
-            anonymise_drop = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Anonymise data on file load.")
-            return_intermediate_files = gr.Dropdown(label = "Return intermediate processing files from file preparation. Files can be loaded in to save processing time in future.", value="No", choices=["Yes", "No"])
-            embedding_super_compress = gr.Dropdown(label = "Round embeddings to three dp for smaller files with less accuracy.", value="No", choices=["Yes", "No"])
-            low_resource_mode_opt = gr.Dropdown(label = "Low resource mode (non-AI embeddings, no LLM-generated topic names).", value=low_resource_mode, choices=["Yes", "No"])
-
+            with gr.Row():
+                anonymise_drop = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Anonymise data on file load. Names and other details are replaced with tags e.g. '<person>'.")
+                return_intermediate_files = gr.Dropdown(label = "Return intermediate processing files from file preparation. Files can be loaded in to save processing time in future.", value="No", choices=["Yes", "No"])
+                embedding_super_compress = gr.Dropdown(label = "Round embeddings to three dp for smaller files with less accuracy.", value="No", choices=["Yes", "No"])
+            with gr.Row():
+                low_resource_mode_opt = gr.Dropdown(label = "Low resource mode (non-AI embeddings, no LLM-generated topic names).", value="No", choices=["Yes", "No"])
+                create_llm_topic_labels = gr.Dropdown(label = "Create LLM-generated topic labels.", value="No", choices=["Yes", "No"])
 
    # Update column names dropdown when file uploaded
    in_files.upload(fn=put_columns_in_df, inputs=[in_files], outputs=[in_colnames, in_label, data_state])
    in_colnames.change(dummy_function, in_colnames, None)
 
-    topics_btn.click(fn=extract_topics, inputs=[data_state, in_files, min_docs_slider, in_colnames, max_topics_slider, candidate_topics, in_label, anonymise_drop, return_intermediate_files, embedding_super_compress, low_resource_mode_opt], outputs=[output_single_text, output_file, plot], api_name="topics")
+    topics_btn.click(fn=extract_topics, inputs=[data_state, in_files, min_docs_slider, in_colnames, max_topics_slider, candidate_topics, in_label, anonymise_drop, return_intermediate_files, embedding_super_compress, low_resource_mode_opt, create_llm_topic_labels], outputs=[output_single_text, output_file, plot], api_name="topics")
 
 block.queue().launch(debug=True)#, server_name="0.0.0.0", ssl_verify=False, server_port=7860)
 
funcs/embeddings.py CHANGED
@@ -11,7 +11,7 @@ if cuda.is_available():
 else:
     torch_device = "cpu"
 
-def make_or_load_embeddings(docs, file_list, data_file_name_no_ext, embedding_model, return_intermediate_files, embeddings_super_compress, low_resource_mode_opt):
+def make_or_load_embeddings(docs, file_list, data_file_name_no_ext, embedding_model, return_intermediate_files, embeddings_super_compress, low_resource_mode_opt, reduce_embeddings="Yes"):
 
     embeddings_file_names = [string.lower() for string in file_list if "embedding" in string.lower()]
 
@@ -38,11 +38,19 @@ def make_or_load_embeddings(docs, file_list, data_file_name_no_ext, embedding_mo
             TruncatedSVD(100)
         )
 
-        embeddings_out = embedding_model.encode(sentences=docs, show_progress_bar = True, batch_size = 32)
+        # Fit the pipeline to the text data
+        embedding_model.fit(docs)
+
+        # Transform text data to embeddings
+        embeddings_out = embedding_model.transform(docs)
+
+        #embeddings_out = embedding_model.encode(sentences=docs, show_progress_bar = True, batch_size = 32)
 
     elif low_resource_mode_opt == "No":
         print("Creating dense embeddings based on transformers model")
 
+        #print("Embedding model is: ", embedding_model)
+
         embeddings_out = embedding_model.encode(sentences=docs, max_length=1024, show_progress_bar = True, batch_size = 32) # For Jina # #
 
         #import torch
@@ -72,7 +80,8 @@ def make_or_load_embeddings(docs, file_list, data_file_name_no_ext, embedding_mo
         np.savez_compressed(semantic_search_file_name, embeddings_out_round)
 
     # Pre-reduce embeddings for visualisation purposes
-    reduced_embeddings = UMAP(n_neighbors=15, n_components=2, min_dist=0.0, metric='cosine', random_state=42).fit_transform(embeddings_out)
-
+    if reduce_embeddings == "Yes":
+        reduced_embeddings = UMAP(n_neighbors=15, n_components=2, min_dist=0.0, metric='cosine', random_state=42).fit_transform(embeddings_out)
+        return embeddings_out, reduced_embeddings
 
-    return embeddings_out, reduced_embeddings
+    return embeddings_out, None
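To illustrate the low-resource path added above: the TfidfVectorizer + TruncatedSVD pipeline is fitted and then used to transform the documents, giving a dense (n_docs, n_components) array that stands in for transformer embeddings. A toy sketch with made-up documents (scikit-learn only):

from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

docs = ["bins were not collected", "missed bin collection again", "library opening hours are too short"] * 10

# TF-IDF then SVD: fit on the corpus, then transform to a dense low-dimensional array
embedding_pipeline = make_pipeline(TfidfVectorizer(), TruncatedSVD(2))
embedding_pipeline.fit(docs)
embeddings_out = embedding_pipeline.transform(docs)
print(embeddings_out.shape)  # (30, 2)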
funcs/helper_functions.py CHANGED
@@ -1,4 +1,5 @@
 import os
+import zipfile
 import re
 import pandas as pd
 import gradio as gr
@@ -87,3 +88,35 @@ def dummy_function(in_colnames):
     A dummy function that exists just so that dropdown updates work correctly.
     """
     return None
+
+# Zip the above to export file
+
+
+def zip_folder(folder_path, output_zip_file):
+    # Create a ZipFile object in write mode
+    with zipfile.ZipFile(output_zip_file, 'w', zipfile.ZIP_DEFLATED) as zipf:
+        # Walk through the directory
+        for root, dirs, files in os.walk(folder_path):
+            for file in files:
+                # Create a complete file path
+                file_path = os.path.join(root, file)
+                # Add file to the zip file
+                # The arcname argument sets the archive name, i.e., the name within the zip file
+                zipf.write(file_path, arcname=os.path.relpath(file_path, folder_path))
+
+def delete_files_in_folder(folder_path):
+    # Check if the folder exists
+    if not os.path.exists(folder_path):
+        print(f"The folder {folder_path} does not exist.")
+        return
+
+    # Iterate over all files in the folder and remove each
+    for filename in os.listdir(folder_path):
+        file_path = os.path.join(folder_path, filename)
+        try:
+            if os.path.isfile(file_path) or os.path.islink(file_path):
+                os.unlink(file_path)
+            else:
+                print(f"Skipping {file_path} as it is a directory")
+        except Exception as e:
+            print(f"Failed to delete {file_path}. Reason: {e}")
funcs/prompts.py CHANGED
@@ -37,7 +37,7 @@ ASSISTANT:Topic label:"""
 
 capybara_prompt = capybara_example_prompt + capybara_main_prompt
 
-print("Capybara prompt: ", capybara_prompt)
+#print("Capybara prompt: ", capybara_prompt)
 
 # System prompt describes information given to all conversations
 open_hermes_start="<|im_start|>"
@@ -72,7 +72,7 @@ Topic label:
 """
 open_hermes_prompt = open_hermes_system_prompt + open_hermes_example_prompt + open_hermes_main_prompt
 
-print("Open Hermes prompt: ", open_hermes_prompt)
+#print("Open Hermes prompt: ", open_hermes_prompt)
 
 stablelm_start = "<|user|>"
 stablelm_example_prompt = """<|user|>
@@ -103,4 +103,4 @@ Topic label:"""
 
 stablelm_prompt = stablelm_example_prompt + stablelm_main_prompt
 
-print("StableLM prompt: ", stablelm_prompt)
+#print("StableLM prompt: ", stablelm_prompt)
funcs/representation_model.py CHANGED
@@ -9,8 +9,6 @@ import torch.cuda
 from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, TextGeneration
 from funcs.prompts import capybara_prompt, capybara_start, open_hermes_prompt, open_hermes_start, stablelm_prompt, stablelm_start
 
-#from huggingface_hub import hf_hub_download
-#hf_hub_download(repo_id='second-state/stablelm-2-zephyr-1.6b-GGUF', filename='stablelm-2-zephyr-1_6b-Q5_K_M.gguf')
 
 hf_model_name = 'TheBloke/phi-2-orange-GGUF' #'NousResearch/Nous-Capybara-7B-V1.9-GGUF' # 'second-state/stablelm-2-zephyr-1.6b-GGUF'
 hf_model_file = 'phi-2-orange.Q5_K_M.gguf' #'Capybara-7B-V1.9-Q5_K_M.gguf' # 'stablelm-2-zephyr-1_6b-Q5_K_M.gguf'
@@ -18,9 +16,9 @@ chosen_prompt = open_hermes_prompt # stablelm_prompt
 chosen_start_tag = open_hermes_start # stablelm_start
 
 # Find model file
-def find_model_file(hf_model_name, hf_model_file):
-    hf_loc = os.environ["HF_HOME"]
-    hf_sub_loc = os.environ["HF_HOME"] + "/hub/"
+def find_model_file(hf_model_name, hf_model_file, search_folder):
+    hf_loc = search_folder #os.environ["HF_HOME"]
+    hf_sub_loc = search_folder + "/hub/" #os.environ["HF_HOME"]
 
     hf_model_name_path = hf_sub_loc + 'models--' + hf_model_name.replace("/","--")
 
@@ -36,16 +34,19 @@ def find_model_file(hf_model_name, hf_model_file):
     folder_path = hf_model_name_path # Replace with your folder path
     file_to_find = hf_model_file # Replace with the file name you're looking for
 
-    found_file = find_file(folder_path, file_to_find)
+    found_file = find_file(folder_path, file_to_find) # os.environ["HF_HOME"]
     if found_file:
         print(f"File found: {found_file}")
         return found_file
     else:
         error = "File not found."
-        print(error)
-        return error
+        print(error, " Downloading model from hub")
+        from huggingface_hub import hf_hub_download
+        hf_hub_download(repo_id=hf_model_name, filename='phi-2-orange.Q5_K_M.gguf')
+        found_file = find_file(folder_path, file_to_find)
+        return found_file
 
-found_file = find_model_file(hf_model_name, hf_model_file)
+found_file = find_model_file(hf_model_name, hf_model_file, os.environ["HF_HOME"])#".")
 
 # Currently set n_gpu_layers to 0 even with cuda due to persistent bugs in implementation with cuda
 if torch.cuda.is_available():
@@ -57,7 +58,7 @@ else:
     low_resource_mode = "Yes"
     n_gpu_layers = 0
 
-#low_resource_mode = "Yes"
+low_resource_mode = "No" # Override for testing
 
 #print("Running on device:", torch_device)
 n_threads = torch.get_num_threads()
@@ -140,32 +141,32 @@ gen_config = LLamacppGenerateConfig(
 # KeyBERT
 keybert = KeyBERTInspired()
 
-if low_resource_mode == "No":
-    # Use llama.cpp to load in model
-    llm = Llama(model_path=found_file, stop=chosen_start_tag, n_gpu_layers=n_gpu_layers, n_ctx=n_ctx) #**gpu_config.model_dump())#
-    #print(llm.n_gpu_layers)
-    llm_model = LlamaCPP(llm, prompt=chosen_prompt)#, **gen_config.model_dump())
-
-    # All representation models
-    representation_model = {
-        "KeyBERT": keybert,
-        "Mistral": llm_model
-    }
+def create_representation_model(create_llm_topic_labels, gpu_config, found_file, chosen_start_tag):
 
-elif low_resource_mode == "Yes":
-    representation_model = {"KeyBERT": keybert}
+    if create_llm_topic_labels == "Yes":
+        # Use llama.cpp to load in model
+        llm = Llama(model_path=found_file, stop=chosen_start_tag, n_gpu_layers=gpu_config.n_gpu_layers, n_ctx=gpu_config.n_ctx) #**gpu_config.model_dump())#
+        #print(llm.n_gpu_layers)
+        llm_model = LlamaCPP(llm, prompt=chosen_prompt)#, **gen_config.model_dump())
 
-# Deprecated example using CTransformers. This package is not really used anymore
-#model = AutoModelForCausalLM.from_pretrained('NousResearch/Nous-Capybara-7B-V1.9-GGUF', model_type='mistral', model_file='Capybara-7B-V1.9-Q5_K_M.gguf', hf=True, **vars(gpu_config))
-#tokenizer = AutoTokenizer.from_pretrained("NousResearch/Nous-Capybara-7B-V1.9")
-#generator = pipeline(task="text-generation", model=model, tokenizer=tokenizer)
+        # All representation models
+        representation_model = {
+            "KeyBERT": keybert,
+            "Mistral": llm_model
+        }
 
-# Text generation with Llama 2
-#mistral_capybara = TextGeneration(generator, prompt=capybara_prompt)
-#mistral_hermes = TextGeneration(generator, prompt=open_hermes_prompt)
+    elif create_llm_topic_labels == "No":
+        representation_model = {"KeyBERT": keybert}
 
 
+    # Deprecated example using CTransformers. This package is not really used anymore
+    #model = AutoModelForCausalLM.from_pretrained('NousResearch/Nous-Capybara-7B-V1.9-GGUF', model_type='mistral', model_file='Capybara-7B-V1.9-Q5_K_M.gguf', hf=True, **vars(gpu_config))
+    #tokenizer = AutoTokenizer.from_pretrained("NousResearch/Nous-Capybara-7B-V1.9")
+    #generator = pipeline(task="text-generation", model=model, tokenizer=tokenizer)
 
-# MMR (is rubbish, don't use)
-#mmr = MaximalMarginalRelevance(diversity=0.3)
+    # Text generation with Llama 2
+    #mistral_capybara = TextGeneration(generator, prompt=capybara_prompt)
+    #mistral_hermes = TextGeneration(generator, prompt=open_hermes_prompt)
+
+    return representation_model
 