Sean-Case committed on
Commit
fac3624
1 Parent(s): aa3df37

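Greatly increased the number of embedding dimensions used in low-resource mode (TruncatedSVD components raised from 100 to 2000) for higher-quality topics. Topic visualisations are now disabled by default to increase speed.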

Browse files
Files changed (3) hide show
  1. app.py +3 -3
  2. funcs/embeddings.py +1 -1
  3. funcs/representation_model.py +1 -1
app.py CHANGED
@@ -295,7 +295,7 @@ with block:
295
  candidate_topics = gr.File(label="Input topics from file (csv). File should have at least one column with a header and topic keywords in cells below. Topics will be taken from the first column of the file. Currently not compatible with low-resource embeddings.")
296
 
297
  with gr.Row():
298
- min_docs_slider = gr.Slider(minimum = 2, maximum = 1000, value = 15, step = 1, label = "Minimum number of documents per topic (use ~3 for low resource mode).")
299
  max_topics_slider = gr.Slider(minimum = 2, maximum = 500, value = 3, step = 1, label = "Maximum number of topics")
300
 
301
  with gr.Row():
@@ -305,7 +305,7 @@ with block:
305
  output_single_text = gr.Textbox(label="Output example (first example in dataset)")
306
  output_file = gr.File(label="Output file")
307
 
308
- plot = gr.Plot(label="Visualise your topics here:")
309
 
310
  with gr.Tab("Options"):
311
  with gr.Accordion("Data load and processing options", open = True):
@@ -317,7 +317,7 @@ with block:
317
  low_resource_mode_opt = gr.Dropdown(label = "Use low resource embeddings and processing.", value="No", choices=["Yes", "No"])
318
  create_llm_topic_labels = gr.Dropdown(label = "Create LLM-generated topic labels.", value="No", choices=["Yes", "No"])
319
  save_topic_model = gr.Dropdown(label = "Save topic model to file.", value="Yes", choices=["Yes", "No"])
320
- visualise_topics = gr.Dropdown(label = "Create a visualisation to map topics.", value="Yes", choices=["Yes", "No"])
321
 
322
  # Update column names dropdown when file uploaded
323
  in_files.upload(fn=put_columns_in_df, inputs=[in_files], outputs=[in_colnames, in_label, data_state])
 
295
  candidate_topics = gr.File(label="Input topics from file (csv). File should have at least one column with a header and topic keywords in cells below. Topics will be taken from the first column of the file. Currently not compatible with low-resource embeddings.")
296
 
297
  with gr.Row():
298
+ min_docs_slider = gr.Slider(minimum = 2, maximum = 1000, value = 15, step = 1, label = "Minimum number of similar documents needed to make a topic.")
299
  max_topics_slider = gr.Slider(minimum = 2, maximum = 500, value = 3, step = 1, label = "Maximum number of topics")
300
 
301
  with gr.Row():
 
305
  output_single_text = gr.Textbox(label="Output example (first example in dataset)")
306
  output_file = gr.File(label="Output file")
307
 
308
+ plot = gr.Plot(label="Visualise your topics here. Go to the 'Options' tab to enable.")
309
 
310
  with gr.Tab("Options"):
311
  with gr.Accordion("Data load and processing options", open = True):
 
317
  low_resource_mode_opt = gr.Dropdown(label = "Use low resource embeddings and processing.", value="No", choices=["Yes", "No"])
318
  create_llm_topic_labels = gr.Dropdown(label = "Create LLM-generated topic labels.", value="No", choices=["Yes", "No"])
319
  save_topic_model = gr.Dropdown(label = "Save topic model to file.", value="Yes", choices=["Yes", "No"])
320
+ visualise_topics = gr.Dropdown(label = "Create a visualisation to map topics.", value="No", choices=["Yes", "No"])
321
 
322
  # Update column names dropdown when file uploaded
323
  in_files.upload(fn=put_columns_in_df, inputs=[in_files], outputs=[in_colnames, in_label, data_state])
funcs/embeddings.py CHANGED
@@ -35,7 +35,7 @@ def make_or_load_embeddings(docs, file_list, data_file_name_no_ext, embedding_mo
35
  print("Creating simplified 'sparse' embeddings based on TfIDF")
36
  embedding_model = make_pipeline(
37
  TfidfVectorizer(),
38
- TruncatedSVD(100, random_state=random_seed)
39
  )
40
 
41
  # Fit the pipeline to the text data
 
35
  print("Creating simplified 'sparse' embeddings based on TfIDF")
36
  embedding_model = make_pipeline(
37
  TfidfVectorizer(),
38
+ TruncatedSVD(2000, random_state=random_seed)
39
  )
40
 
41
  # Fit the pipeline to the text data
funcs/representation_model.py CHANGED
@@ -119,7 +119,7 @@ llm_config = LLamacppInitConfigGpu(last_n_tokens_size=last_n_tokens_size,
119
  # KeyBERT
120
  keybert = KeyBERTInspired(random_state=random_seed)
121
  # MMR
122
- mmr = MaximalMarginalRelevance(diversity=0.3)
123
 
124
  def create_representation_model(create_llm_topic_labels, llm_config, hf_model_name, hf_model_file, chosen_start_tag, low_resource_mode):
125
 
 
119
  # KeyBERT
120
  keybert = KeyBERTInspired(random_state=random_seed)
121
  # MMR
122
+ mmr = MaximalMarginalRelevance(diversity=0.2)
123
 
124
  def create_representation_model(create_llm_topic_labels, llm_config, hf_model_name, hf_model_file, chosen_start_tag, low_resource_mode):
125