Sean-Case committed on
Commit
43ac0d8
1 Parent(s): fac3624

Returned TruncatedSVD components to 100 - higher values don't seem to help

Browse files
Files changed (2) hide show
  1. app.py +11 -8
  2. funcs/embeddings.py +2 -1
app.py CHANGED
@@ -128,21 +128,20 @@ def extract_topics(in_files, in_file, min_docs_slider, in_colnames, max_topics_s
128
  umap_model = UMAP(n_neighbors=15, n_components=5, random_state=random_seed)
129
 
130
  elif low_resource_mode == "Yes":
131
- print("Choosing low resource TF-IDF model")
 
132
  embedding_model_pipe = make_pipeline(
133
  TfidfVectorizer(),
134
  TruncatedSVD(100) # 100 # To be compatible with zero shot, this needs to be lower than number of suggested topics
135
  )
136
  embedding_model = embedding_model_pipe
137
 
138
- umap_model = TruncatedSVD(n_components=3, random_state=random_seed)
139
-
140
-
141
 
142
  embeddings_out, reduced_embeddings = make_or_load_embeddings(docs, file_list, data_file_name_no_ext, embedding_model, return_intermediate_files, embeddings_super_compress, low_resource_mode, create_llm_topic_labels)
143
 
144
 
145
- vectoriser_model = CountVectorizer(stop_words="english", ngram_range=(1, 2), min_df=0.1)
146
 
147
  from funcs.prompts import capybara_prompt, capybara_start, open_hermes_prompt, open_hermes_start, stablelm_prompt, stablelm_start
148
  from funcs.representation_model import create_representation_model, llm_config, chosen_start_tag
@@ -241,10 +240,14 @@ def extract_topics(in_files, in_file, min_docs_slider, in_colnames, max_topics_s
241
 
242
  if return_intermediate_files == "Yes":
243
  print("Saving embeddings to file")
244
- semantic_search_file_name = data_file_name_no_ext + '_' + 'embeddings.npz'
245
- np.savez_compressed(semantic_search_file_name, embeddings_out)
 
 
 
 
246
 
247
- output_list.append(semantic_search_file_name)
248
 
249
  if visualise_topics == "Yes":
250
  # Visualise the topics:
 
128
  umap_model = UMAP(n_neighbors=15, n_components=5, random_state=random_seed)
129
 
130
  elif low_resource_mode == "Yes":
131
+ print("Choosing low resource TF-IDF model.")
132
+
133
  embedding_model_pipe = make_pipeline(
134
  TfidfVectorizer(),
135
  TruncatedSVD(100) # 100 # To be compatible with zero shot, this needs to be lower than number of suggested topics
136
  )
137
  embedding_model = embedding_model_pipe
138
 
139
+ umap_model = TruncatedSVD(n_components=5, random_state=random_seed)
 
 
140
 
141
  embeddings_out, reduced_embeddings = make_or_load_embeddings(docs, file_list, data_file_name_no_ext, embedding_model, return_intermediate_files, embeddings_super_compress, low_resource_mode, create_llm_topic_labels)
142
 
143
 
144
+ vectoriser_model = CountVectorizer(stop_words="english", ngram_range=(1, 2), min_df=0.05, max_df=0.9)
145
 
146
  from funcs.prompts import capybara_prompt, capybara_start, open_hermes_prompt, open_hermes_start, stablelm_prompt, stablelm_start
147
  from funcs.representation_model import create_representation_model, llm_config, chosen_start_tag
 
240
 
241
  if return_intermediate_files == "Yes":
242
  print("Saving embeddings to file")
243
+ if low_resource_mode == "Yes":
244
+ embeddings_file_name = data_file_name_no_ext + '_' + 'tfidf_embeddings.npz'
245
+ else:
246
+ embeddings_file_name = data_file_name_no_ext + '_' + 'ai_embeddings.npz'
247
+
248
+ np.savez_compressed(embeddings_file_name, embeddings_out)
249
 
250
+ output_list.append(embeddings_file_name)
251
 
252
  if visualise_topics == "Yes":
253
  # Visualise the topics:
funcs/embeddings.py CHANGED
@@ -33,9 +33,10 @@ def make_or_load_embeddings(docs, file_list, data_file_name_no_ext, embedding_mo
33
  # If on CPU, don't resort to embedding models
34
  if low_resource_mode_opt == "Yes":
35
  print("Creating simplified 'sparse' embeddings based on TfIDF")
 
36
  embedding_model = make_pipeline(
37
  TfidfVectorizer(),
38
- TruncatedSVD(2000, random_state=random_seed)
39
  )
40
 
41
  # Fit the pipeline to the text data
 
33
  # If on CPU, don't resort to embedding models
34
  if low_resource_mode_opt == "Yes":
35
  print("Creating simplified 'sparse' embeddings based on TfIDF")
36
+
37
  embedding_model = make_pipeline(
38
  TfidfVectorizer(),
39
+ TruncatedSVD(100, random_state=random_seed)
40
  )
41
 
42
  # Fit the pipeline to the text data