seanpedrickcase committed
Commit f2d85f1 · 1 Parent(s): 125e31b

Improved the implementation of group-based analysis. It should now be possible to run a group-based analysis all the way through to summarisation.
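The recurring idiom behind this commit is to default a `Group` column to "All" when the input has no grouping, and to include `Group` in every downstream groupby key. Below is a minimal sketch of that guard, not the repo's own function; the real changes live in tools/dedup_summaries.py and tools/helper_functions.py.

```python
import pandas as pd

def ensure_group_column(df: pd.DataFrame) -> pd.DataFrame:
    # Guard added throughout this commit: fall back to a single "All" group
    # when no group-based analysis was run.
    if "Group" not in df.columns:
        df["Group"] = "All"
    return df

# Downstream groupby keys now include "Group", so topics from different
# groups are never merged together, e.g.:
# reference_df.groupby(["General Topic", "Subtopic", "Sentiment", "Group"])
```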

app.py CHANGED
@@ -80,23 +80,18 @@ with app:
80
  # UI LAYOUT
81
  ###
82
 
83
- gr.Markdown(
84
- """# Large language model topic modelling
85
 
86
  Extract topics and summarise outputs using Large Language Models (LLMs, a Gemma model if local, Gemini Flash/Pro, or Claude 3 through AWS Bedrock if running on AWS). The app will query the LLM with batches of responses to produce summary tables, which are then compared iteratively to output a table with the general topics, subtopics, topic sentiment, and relevant text rows related to them. The prompts are designed for topic modelling public consultations, but they can be adapted to different contexts (see the LLM settings tab to modify).
87
 
88
  Instructions on use can be found in the README.md file. Try it out with this [dummy development consultation dataset](https://huggingface.co/datasets/seanpedrickcase/dummy_development_consultation), which you can also try with [zero-shot topics](https://huggingface.co/datasets/seanpedrickcase/dummy_development_consultation/blob/main/example_zero_shot.csv), or this [dummy case notes dataset](https://huggingface.co/datasets/seanpedrickcase/dummy_case_notes).
89
 
90
- You can use an AWS Bedrock model (Claude 3, paid), or Gemini (a free API, but with strict limits for the Pro model). The use of Gemini models requires an API key. To set up your own Gemini API key, go [here](https://aistudio.google.com/app/u/1/plan_information).
91
 
92
  NOTE that **API calls to Gemini are not considered secure**, so please only submit redacted, non-sensitive tabular files to this source. Also, large language models are not 100% accurate and may produce biased or harmful outputs. All outputs from this app **absolutely need to be checked by a human** for harmful outputs, hallucinations, and accuracy.""")
93
 
94
  with gr.Tab(label="Extract topics"):
95
- gr.Markdown(
96
- """
97
- ### Choose a tabular data file (xlsx or csv) of open text to extract topics from.
98
- """
99
- )
100
  with gr.Row():
101
  model_choice = gr.Dropdown(value = default_model_choice, choices = model_full_names, label="LLM model to use", multiselect=False)
102
  in_api_key = gr.Textbox(value = GEMINI_API_KEY, label="Enter Gemini API key (only if using Google API models)", lines=1, type="password")
@@ -138,10 +133,7 @@ with app:
138
  s3_logs_output_textbox = gr.Textbox(label="Feedback submission logs", visible=False)
139
 
140
  with gr.Tab(label="Modify, deduplicate, and summarise topic outputs"):
141
- gr.Markdown(
142
- """
143
- Load in previously completed Extract Topics output files ('reference_table', and 'unique_topics' files) to modify topics, deduplicate topics, or summarise the outputs. If you want pivot table outputs, please load in the original data file along with the selected open text column on the first tab before deduplicating or summarising.
144
- """)
145
 
146
  with gr.Accordion("Modify existing topics", open = False):
147
  modification_input_files = gr.File(height=file_input_height, label="Upload files to modify topics", file_count= "multiple", file_types=['.xlsx', '.xls', '.csv', '.parquet', '.csv.gz'])
@@ -184,19 +176,13 @@ with app:
184
  overall_summarised_output_markdown = gr.Markdown(value="### Overall summary will appear here", show_copy_button=True)
185
 
186
  with gr.Tab(label="Topic table viewer"):
187
- gr.Markdown(
188
- """
189
- ### View a 'unique_topic_table' csv file in markdown format.
190
- """)
191
 
192
  in_view_table = gr.File(height=file_input_height, label="Choose unique topic csv files", file_count= "single", file_types=['.csv', '.parquet', '.csv.gz'])
193
  view_table_markdown = gr.Markdown(value = "", label="View table", show_copy_button=True)
194
 
195
  with gr.Tab(label="Continue unfinished topic extraction"):
196
- gr.Markdown(
197
- """
198
- ### Load in output files from a previous topic extraction process and continue topic extraction with new data.
199
- """)
200
 
201
  with gr.Accordion("Upload reference data file and unique data files", open = True):
202
  in_previous_data_files = gr.File(height=file_input_height, label="Choose output csv files", file_count= "multiple", file_types=['.xlsx', '.xls', '.csv', '.parquet', '.csv.gz'])
@@ -204,11 +190,7 @@ with app:
204
  continue_previous_data_files_btn = gr.Button(value="Continue previous topic extraction", variant="primary")
205
 
206
  with gr.Tab(label="Verify descriptions"):
207
- gr.Markdown(
208
- """
209
- ### Choose a tabular data file (xlsx or csv) with titles and original text to verify descriptions for.
210
- """
211
- )
212
  with gr.Row():
213
  verify_model_choice = gr.Dropdown(value = default_model_choice, choices = model_full_names, label="LLM model to use", multiselect=False)
214
  verify_in_api_key = gr.Textbox(value = "", label="Enter Gemini API key (only if using Google API models)", lines=1, type="password")
@@ -227,10 +209,7 @@ with app:
227
  verify_modification_input_files_placeholder = gr.File(height=file_input_height, label="Placeholder for files to avoid errors", visible=False)
228
 
229
  with gr.Tab(label="Topic extraction settings"):
230
- gr.Markdown(
231
- """
232
- Define settings that affect large language model output.
233
- """)
234
  with gr.Accordion("Settings for LLM generation", open = True):
235
  temperature_slide = gr.Slider(minimum=0.1, maximum=1.0, value=0.1, label="Choose LLM temperature setting")
236
  batch_size_number = gr.Number(label = "Number of responses to submit in a single LLM query", value = BATCH_SIZE_DEFAULT, precision=0, minimum=1, maximum=100)
@@ -296,32 +275,28 @@ with app:
296
  success(fn=wrapper_extract_topics_per_column_value,
297
  inputs=[in_group_col,
298
  in_data_files,
299
-
300
  file_data_state,
301
  master_topic_df_state,
302
  master_reference_df_state,
303
  master_unique_topics_df_state,
304
  display_topic_table_markdown,
305
  reference_data_file_name_textbox,
306
-
307
  total_number_of_batches,
308
  in_api_key,
309
  temperature_slide,
310
  in_colnames,
311
  model_choice,
312
  candidate_topics,
313
-
314
  first_loop_state,
315
  conversation_metadata_textbox,
316
  latest_batch_completed,
317
- estimated_time_taken_number,
318
-
319
  initial_table_prompt_textbox,
320
  prompt_2_textbox,
321
  prompt_3_textbox,
322
  system_prompt_textbox,
323
- add_to_existing_topics_system_prompt_textbox, add_to_existing_topics_prompt_textbox,
324
-
325
  number_of_prompts,
326
  batch_size_number,
327
  context_textbox,
@@ -371,14 +346,14 @@ with app:
371
  # When button pressed, summarise previous data
372
  summarise_previous_data_btn.click(empty_output_vars_summarise, inputs=None, outputs=[summary_reference_table_sample_state, master_unique_topics_df_revised_summaries_state, master_reference_df_revised_summaries_state, summary_output_files, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, overall_summarisation_input_files]).\
373
  success(load_in_previous_data_files, inputs=[summarisation_input_files], outputs=[master_reference_df_state, master_unique_topics_df_state, latest_batch_completed_no_loop, deduplication_input_files_status, reference_data_file_name_textbox, unique_topics_table_file_name_textbox]).\
374
- success(sample_reference_table_summaries, inputs=[master_reference_df_state, master_unique_topics_df_state, random_seed], outputs=[summary_reference_table_sample_state, summarised_references_markdown, master_reference_df_state, master_unique_topics_df_state], api_name="sample_summaries").\
375
  success(summarise_output_topics, inputs=[summary_reference_table_sample_state, master_unique_topics_df_state, master_reference_df_state, model_choice, in_api_key, temperature_slide, reference_data_file_name_textbox, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, in_data_files, in_excel_sheets, in_colnames, log_files_output_list_state, summarise_format_radio, output_folder_state], outputs=[summary_reference_table_sample_state, master_unique_topics_df_revised_summaries_state, master_reference_df_revised_summaries_state, summary_output_files, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, summarised_output_markdown, log_files_output, overall_summarisation_input_files], api_name="summarise_topics")
376
 
377
  latest_summary_completed_num.change(summarise_output_topics, inputs=[summary_reference_table_sample_state, master_unique_topics_df_state, master_reference_df_state, model_choice, in_api_key, temperature_slide, reference_data_file_name_textbox, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, in_data_files, in_excel_sheets, in_colnames, log_files_output_list_state, summarise_format_radio, output_folder_state], outputs=[summary_reference_table_sample_state, master_unique_topics_df_revised_summaries_state, master_reference_df_revised_summaries_state, summary_output_files, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, summarised_output_markdown, log_files_output, overall_summarisation_input_files], scroll_to_output=True)
378
 
379
  # SUMMARISE WHOLE TABLE PAGE
380
  overall_summarise_previous_data_btn.click(load_in_previous_data_files, inputs=[overall_summarisation_input_files], outputs=[master_reference_df_state, master_unique_topics_df_state, latest_batch_completed_no_loop, deduplication_input_files_status, reference_data_file_name_textbox, unique_topics_table_file_name_textbox]).\
381
- success(overall_summary, inputs=[master_unique_topics_df_state, model_choice, in_api_key, temperature_slide, reference_data_file_name_textbox, summarised_outputs_list, output_folder_state], outputs=[overall_summary_output_files, overall_summarised_output_markdown], scroll_to_output=True, api_name="overall_summary")
382
 
383
  ###
384
  # CONTINUE PREVIOUS TOPIC EXTRACTION PAGE
 
80
  # UI LAYOUT
81
  ###
82
 
83
+ gr.Markdown("""# Large language model topic modelling
 
84
 
85
  Extract topics and summarise outputs using Large Language Models (LLMs, a Gemma model if local, Gemini Flash/Pro, or Claude 3 through AWS Bedrock if running on AWS). The app will query the LLM with batches of responses to produce summary tables, which are then compared iteratively to output a table with the general topics, subtopics, topic sentiment, and relevant text rows related to them. The prompts are designed for topic modelling public consultations, but they can be adapted to different contexts (see the LLM settings tab to modify).
86
 
87
  Instructions on use can be found in the README.md file. Try it out with this [dummy development consultation dataset](https://huggingface.co/datasets/seanpedrickcase/dummy_development_consultation), which you can also try with [zero-shot topics](https://huggingface.co/datasets/seanpedrickcase/dummy_development_consultation/blob/main/example_zero_shot.csv), or this [dummy case notes dataset](https://huggingface.co/datasets/seanpedrickcase/dummy_case_notes).
88
 
89
+ You can use an AWS Bedrock model (Claude 3, paid), or Gemini (a free API, but with strict limits for the Pro model). The use of Gemini models requires an API key. To set up your own Gemini API key, go [here](https://aistudio.google.com/app/u/1/plan_information).
90
 
91
  NOTE that **API calls to Gemini are not considered secure**, so please only submit redacted, non-sensitive tabular files to this source. Also, large language models are not 100% accurate and may produce biased or harmful outputs. All outputs from this app **absolutely need to be checked by a human** for harmful outputs, hallucinations, and accuracy.""")
92
 
93
  with gr.Tab(label="Extract topics"):
94
+ gr.Markdown("""### Choose a tabular data file (xlsx or csv) of open text to extract topics from.""")
 
 
 
 
95
  with gr.Row():
96
  model_choice = gr.Dropdown(value = default_model_choice, choices = model_full_names, label="LLM model to use", multiselect=False)
97
  in_api_key = gr.Textbox(value = GEMINI_API_KEY, label="Enter Gemini API key (only if using Google API models)", lines=1, type="password")
 
133
  s3_logs_output_textbox = gr.Textbox(label="Feedback submission logs", visible=False)
134
 
135
  with gr.Tab(label="Modify, deduplicate, and summarise topic outputs"):
136
+ gr.Markdown("""Load in previously completed Extract Topics output files ('reference_table', and 'unique_topics' files) to modify topics, deduplicate topics, or summarise the outputs. If you want pivot table outputs, please load in the original data file along with the selected open text column on the first tab before deduplicating or summarising.""")
 
 
 
137
 
138
  with gr.Accordion("Modify existing topics", open = False):
139
  modification_input_files = gr.File(height=file_input_height, label="Upload files to modify topics", file_count= "multiple", file_types=['.xlsx', '.xls', '.csv', '.parquet', '.csv.gz'])
 
176
  overall_summarised_output_markdown = gr.Markdown(value="### Overall summary will appear here", show_copy_button=True)
177
 
178
  with gr.Tab(label="Topic table viewer"):
179
+ gr.Markdown("""### View a 'unique_topic_table' csv file in markdown format.""")
 
 
 
180
 
181
  in_view_table = gr.File(height=file_input_height, label="Choose unique topic csv files", file_count= "single", file_types=['.csv', '.parquet', '.csv.gz'])
182
  view_table_markdown = gr.Markdown(value = "", label="View table", show_copy_button=True)
183
 
184
  with gr.Tab(label="Continue unfinished topic extraction"):
185
+ gr.Markdown("""### Load in output files from a previous topic extraction process and continue topic extraction with new data.""")
 
 
 
186
 
187
  with gr.Accordion("Upload reference data file and unique data files", open = True):
188
  in_previous_data_files = gr.File(height=file_input_height, label="Choose output csv files", file_count= "multiple", file_types=['.xlsx', '.xls', '.csv', '.parquet', '.csv.gz'])
 
190
  continue_previous_data_files_btn = gr.Button(value="Continue previous topic extraction", variant="primary")
191
 
192
  with gr.Tab(label="Verify descriptions"):
193
+ gr.Markdown("""### Choose a tabular data file (xlsx or csv) with titles and original text to verify descriptions for.""")
 
 
 
 
194
  with gr.Row():
195
  verify_model_choice = gr.Dropdown(value = default_model_choice, choices = model_full_names, label="LLM model to use", multiselect=False)
196
  verify_in_api_key = gr.Textbox(value = "", label="Enter Gemini API key (only if using Google API models)", lines=1, type="password")
 
209
  verify_modification_input_files_placeholder = gr.File(height=file_input_height, label="Placeholder for files to avoid errors", visible=False)
210
 
211
  with gr.Tab(label="Topic extraction settings"):
212
+ gr.Markdown("""Define settings that affect large language model output.""")
 
 
 
213
  with gr.Accordion("Settings for LLM generation", open = True):
214
  temperature_slide = gr.Slider(minimum=0.1, maximum=1.0, value=0.1, label="Choose LLM temperature setting")
215
  batch_size_number = gr.Number(label = "Number of responses to submit in a single LLM query", value = BATCH_SIZE_DEFAULT, precision=0, minimum=1, maximum=100)
 
275
  success(fn=wrapper_extract_topics_per_column_value,
276
  inputs=[in_group_col,
277
  in_data_files,
 
278
  file_data_state,
279
  master_topic_df_state,
280
  master_reference_df_state,
281
  master_unique_topics_df_state,
282
  display_topic_table_markdown,
283
  reference_data_file_name_textbox,
 
284
  total_number_of_batches,
285
  in_api_key,
286
  temperature_slide,
287
  in_colnames,
288
  model_choice,
289
  candidate_topics,
 
290
  first_loop_state,
291
  conversation_metadata_textbox,
292
  latest_batch_completed,
293
+ estimated_time_taken_number,
 
294
  initial_table_prompt_textbox,
295
  prompt_2_textbox,
296
  prompt_3_textbox,
297
  system_prompt_textbox,
298
+ add_to_existing_topics_system_prompt_textbox,
299
+ add_to_existing_topics_prompt_textbox,
300
  number_of_prompts,
301
  batch_size_number,
302
  context_textbox,
 
346
  # When button pressed, summarise previous data
347
  summarise_previous_data_btn.click(empty_output_vars_summarise, inputs=None, outputs=[summary_reference_table_sample_state, master_unique_topics_df_revised_summaries_state, master_reference_df_revised_summaries_state, summary_output_files, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, overall_summarisation_input_files]).\
348
  success(load_in_previous_data_files, inputs=[summarisation_input_files], outputs=[master_reference_df_state, master_unique_topics_df_state, latest_batch_completed_no_loop, deduplication_input_files_status, reference_data_file_name_textbox, unique_topics_table_file_name_textbox]).\
349
+ success(sample_reference_table_summaries, inputs=[master_reference_df_state, random_seed], outputs=[summary_reference_table_sample_state, summarised_references_markdown], api_name="sample_summaries").\
350
  success(summarise_output_topics, inputs=[summary_reference_table_sample_state, master_unique_topics_df_state, master_reference_df_state, model_choice, in_api_key, temperature_slide, reference_data_file_name_textbox, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, in_data_files, in_excel_sheets, in_colnames, log_files_output_list_state, summarise_format_radio, output_folder_state], outputs=[summary_reference_table_sample_state, master_unique_topics_df_revised_summaries_state, master_reference_df_revised_summaries_state, summary_output_files, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, summarised_output_markdown, log_files_output, overall_summarisation_input_files], api_name="summarise_topics")
351
 
352
  latest_summary_completed_num.change(summarise_output_topics, inputs=[summary_reference_table_sample_state, master_unique_topics_df_state, master_reference_df_state, model_choice, in_api_key, temperature_slide, reference_data_file_name_textbox, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, in_data_files, in_excel_sheets, in_colnames, log_files_output_list_state, summarise_format_radio, output_folder_state], outputs=[summary_reference_table_sample_state, master_unique_topics_df_revised_summaries_state, master_reference_df_revised_summaries_state, summary_output_files, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, summarised_output_markdown, log_files_output, overall_summarisation_input_files], scroll_to_output=True)
353
 
354
  # SUMMARISE WHOLE TABLE PAGE
355
  overall_summarise_previous_data_btn.click(load_in_previous_data_files, inputs=[overall_summarisation_input_files], outputs=[master_reference_df_state, master_unique_topics_df_state, latest_batch_completed_no_loop, deduplication_input_files_status, reference_data_file_name_textbox, unique_topics_table_file_name_textbox]).\
356
+ success(overall_summary, inputs=[master_unique_topics_df_state, model_choice, in_api_key, temperature_slide, unique_topics_table_file_name_textbox, summarised_outputs_list, output_folder_state], outputs=[overall_summary_output_files, overall_summarised_output_markdown], scroll_to_output=True, api_name="overall_summary")
357
 
358
  ###
359
  # CONTINUE PREVIOUS TOPIC EXTRACTION PAGE
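The event wiring in app.py chains handlers with Gradio's `.success()`, so each later step (sampling summaries, summarising topics, overall summary) only runs if the previous handler completed without raising. A stripped-down sketch of that pattern; the component and function names here are illustrative stand-ins, not the app's own:

```python
import gradio as gr

def load_files(files):
    # Stand-in for load_in_previous_data_files: report what was uploaded.
    return f"Loaded {len(files or [])} file(s)"

def summarise(status):
    # Stand-in for summarise_output_topics: only runs if load_files succeeded.
    return status + " -> summarised"

with gr.Blocks() as demo:
    in_files = gr.File(file_count="multiple", label="Previous output files")
    status = gr.Textbox(label="Status")
    summary = gr.Textbox(label="Summary")
    run_btn = gr.Button("Summarise previous data")

    # Chain the steps: the second handler fires only if the first succeeds,
    # mirroring the .click(...).success(...) chains in app.py.
    run_btn.click(load_files, inputs=[in_files], outputs=[status]).\
        success(summarise, inputs=[status], outputs=[summary])

demo.launch()
```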
tools/dedup_summaries.py CHANGED
@@ -9,7 +9,7 @@ from tqdm import tqdm
9
 
10
  from tools.prompts import summarise_topic_descriptions_prompt, summarise_topic_descriptions_system_prompt, system_prompt, summarise_everything_prompt, comprehensive_summary_format_prompt
11
  from tools.llm_funcs import construct_gemini_generative_model, process_requests, ResponseObject, load_model
12
- from tools.helper_functions import create_topic_summary_df_from_reference_table, load_in_data_file, get_basic_response_data, convert_reference_table_to_pivot_table, wrap_text
13
  from tools.config import OUTPUT_FOLDER, RUN_LOCAL_MODEL, MAX_COMMENT_CHARS, MAX_TOKENS, TIMEOUT_WAIT, NUMBER_OF_RETRY_ATTEMPTS, MAX_TIME_FOR_LOOP, BATCH_SIZE_DEFAULT, DEDUPLICATION_THRESHOLD, model_name_map, CHOSEN_LOCAL_MODEL_TYPE, LOCAL_REPO_ID, LOCAL_MODEL_FILE, LOCAL_MODEL_FOLDER, LLM_SEED
14
 
15
  max_tokens = MAX_TOKENS
@@ -158,17 +158,17 @@ def deduplicate_topics(reference_df:pd.DataFrame,
158
  print(out_message)
159
  #raise Exception(out_message)
160
 
161
-
162
-
163
  # Run through this x times to try to get all duplicate topics
164
  if deduplicate_topics == "Yes":
 
 
165
  for i in range(0, 8):
166
  if merge_sentiment == "No":
167
  if merge_general_topics == "No":
168
  reference_df["old_category"] = reference_df["Subtopic"] + " | " + reference_df["Sentiment"]
169
  reference_df_unique = reference_df.drop_duplicates("old_category")
170
 
171
- deduplicated_topic_map_df = reference_df_unique.groupby(["General Topic", "Sentiment"]).apply(
172
  lambda group: deduplicate_categories(
173
  group["Subtopic"],
174
  group["Sentiment"],
@@ -233,8 +233,6 @@ def deduplicate_topics(reference_df:pd.DataFrame,
233
  # Remove rows where 'deduplicated_category' is blank or NaN
234
  deduplicated_topic_map_df = deduplicated_topic_map_df.loc[(deduplicated_topic_map_df['deduplicated_category'].str.strip() != '') & ~(deduplicated_topic_map_df['deduplicated_category'].isnull()), ['old_category','deduplicated_category', 'match_score']]
235
 
236
- #deduplicated_topic_map_df.to_csv(output_folder + "deduplicated_topic_map_df_" + str(i) + ".csv", index=None)
237
-
238
  reference_df = reference_df.merge(deduplicated_topic_map_df, on="old_category", how="left")
239
 
240
  reference_df.rename(columns={"Subtopic": "Subtopic_old", "Sentiment": "Sentiment_old"}, inplace=True)
@@ -246,14 +244,9 @@ def deduplicate_topics(reference_df:pd.DataFrame,
246
  reference_df["Subtopic"] = reference_df["deduplicated_category"].combine_first(reference_df["Subtopic_old"])
247
  reference_df["Sentiment"] = reference_df["Sentiment"].combine_first(reference_df["Sentiment_old"])
248
 
249
-
250
- reference_df.drop(['old_category', 'deduplicated_category', "Subtopic_old", "Sentiment_old"], axis=1, inplace=True, errors="ignore")
251
-
252
- reference_df = reference_df[["Response References", "General Topic", "Subtopic", "Sentiment", "Summary", "Start row of group"]]
253
-
254
- #reference_df["General Topic"] = reference_df["General Topic"].str.lower().str.capitalize()
255
- #reference_df["Subtopic"] = reference_df["Subtopic"].str.lower().str.capitalize()
256
- #reference_df["Sentiment"] = reference_df["Sentiment"].str.lower().str.capitalize()
257
 
258
  if merge_general_topics == "Yes":
259
  # Replace General topic names for each Subtopic with that for the Subtopic with the most responses
@@ -285,8 +278,10 @@ def deduplicate_topics(reference_df:pd.DataFrame,
285
  # Clean up the DataFrame by dropping the UniqueCount column
286
  reference_df.drop(columns=['UniqueCount'], inplace=True)
287
 
288
- reference_df = reference_df[["Response References", "General Topic", "Subtopic", "Sentiment", "Summary", "Start row of group"]]
289
-
 
 
290
  # Update reference summary column with all summaries
291
  reference_df["Summary"] = reference_df.groupby(
292
  ["Response References", "General Topic", "Subtopic", "Sentiment"]
@@ -301,19 +296,17 @@ def deduplicate_topics(reference_df:pd.DataFrame,
301
  # Drop duplicates in the reference table - each comment should only have the same topic referred to once
302
  reference_df.drop_duplicates(['Response References', 'General Topic', 'Subtopic', 'Sentiment'], inplace=True)
303
 
304
-
305
  # Remake topic_summary_df based on new reference_df
306
  topic_summary_df = create_topic_summary_df_from_reference_table(reference_df)
307
 
308
  # Then merge the topic numbers back to the original dataframe
309
  reference_df = reference_df.merge(
310
- topic_summary_df[['General Topic', 'Subtopic', 'Sentiment', 'Topic_number']],
311
- on=['General Topic', 'Subtopic', 'Sentiment'],
312
  how='left'
313
  )
314
 
315
- else:
316
- print("Topics have not beeen deduplicated")
317
 
318
 
319
  if not file_data.empty:
@@ -337,13 +330,11 @@ def deduplicate_topics(reference_df:pd.DataFrame,
337
 
338
  # Outputs for markdown table output
339
  topic_summary_df_revised_display = topic_summary_df.apply(lambda col: col.map(lambda x: wrap_text(x, max_text_length=500)))
340
-
341
  deduplicated_unique_table_markdown = topic_summary_df_revised_display.to_markdown(index=False)
342
 
343
  return reference_df, topic_summary_df, output_files, log_output_files, deduplicated_unique_table_markdown
344
 
345
  def sample_reference_table_summaries(reference_df:pd.DataFrame,
346
- topic_summary_df:pd.DataFrame,
347
  random_seed:int,
348
  no_of_sampled_summaries:int=150):
349
 
@@ -354,7 +345,10 @@ def sample_reference_table_summaries(reference_df:pd.DataFrame,
354
  all_summaries = pd.DataFrame()
355
  output_files = []
356
 
357
- reference_df_grouped = reference_df.groupby(["General Topic", "Subtopic", "Sentiment"])
 
 
 
358
 
359
  if 'Revised summary' in reference_df.columns:
360
  out_message = "Summary has already been created for this file"
@@ -389,7 +383,7 @@ def sample_reference_table_summaries(reference_df:pd.DataFrame,
389
 
390
  summarised_references_markdown = summarised_references.to_markdown(index=False)
391
 
392
- return summarised_references, summarised_references_markdown, reference_df, topic_summary_df
393
 
394
  def summarise_output_topics_query(model_choice:str, in_api_key:str, temperature:float, formatted_summary_prompt:str, summarise_topic_descriptions_system_prompt:str, local_model=[]):
395
  conversation_history = []
@@ -453,7 +447,7 @@ def summarise_output_topics(summarised_references:pd.DataFrame,
453
  out_metadata = []
454
  local_model = []
455
  summarised_output_markdown = ""
456
- output_files = []
457
 
458
  # Check for data for summarisations
459
  if not topic_summary_df.empty and not reference_table_df.empty:
@@ -475,11 +469,12 @@ def summarise_output_topics(summarised_references:pd.DataFrame,
475
  out_message = "No file data found, pivot table output will not be created."
476
  print(out_message)
477
  raise Exception(out_message)
 
 
 
478
 
479
- try:
480
- all_summaries = summarised_references["Summary"].tolist()
481
- except:
482
- all_summaries = summarised_references["Revised summary"].tolist()
483
 
484
  length_all_summaries = len(all_summaries)
485
 
@@ -488,16 +483,14 @@ def summarise_output_topics(summarised_references:pd.DataFrame,
488
  print("All summaries completed. Creating outputs.")
489
 
490
  model_choice_clean = model_name_map[model_choice]
491
- file_name = re.search(r'(.*?)(?:_batch_|_col_)', table_file_name).group(1) if re.search(r'(.*?)(?:_batch_|_col_)', table_file_name) else table_file_name
492
  latest_batch_completed = int(re.search(r'batch_(\d+)_', table_file_name).group(1)) if 'batch_' in table_file_name else ""
493
  batch_size_number = int(re.search(r'size_(\d+)_', table_file_name).group(1)) if 'size_' in table_file_name else ""
494
  in_column_cleaned = re.search(r'col_(.*?)_reference', table_file_name).group(1) if 'col_' in table_file_name else ""
495
 
496
  # Save outputs for each batch. If master file created, label file as master
497
- if latest_batch_completed:
498
- batch_file_path_details = f"{file_name}_batch_{latest_batch_completed}_size_{batch_size_number}_col_{in_column_cleaned}"
499
- else:
500
- batch_file_path_details = f"{file_name}_col_{in_column_cleaned}"
501
 
502
  summarised_references["Revised summary"] = summarised_outputs
503
 
@@ -511,7 +504,7 @@ def summarise_output_topics(summarised_references:pd.DataFrame,
511
  # If no new summary is available, keep the original
512
  topic_summary_df_revised["Revised summary"] = topic_summary_df_revised["Revised summary"].combine_first(topic_summary_df_revised["Summary"])
513
 
514
- topic_summary_df_revised = topic_summary_df_revised[["General Topic", "Subtopic", "Sentiment", "Number of responses", "Revised summary"]]
515
 
516
  # Replace all instances of 'Rows X to Y:' that remain on some topics that have not had additional summaries
517
  topic_summary_df_revised["Revised summary"] = topic_summary_df_revised["Revised summary"].str.replace("^Rows\s+\d+\s+to\s+\d+:\s*", "", regex=True)
@@ -545,7 +538,6 @@ def summarise_output_topics(summarised_references:pd.DataFrame,
545
 
546
  ###
547
  topic_summary_df_revised_display = topic_summary_df_revised.apply(lambda col: col.map(lambda x: wrap_text(x, max_text_length=500)))
548
-
549
  summarised_output_markdown = topic_summary_df_revised_display.to_markdown(index=False)
550
 
551
  # Ensure same file name not returned twice
@@ -560,16 +552,14 @@ def summarise_output_topics(summarised_references:pd.DataFrame,
560
  #print("Last summary number:", length_all_summaries)
561
 
562
  if (model_choice == CHOSEN_LOCAL_MODEL_TYPE) & (RUN_LOCAL_MODEL == "1"):
563
- progress(0.1, f"Loading in local model: {CHOSEN_LOCAL_MODEL_TYPE}")
564
- local_model, tokenizer = load_model(local_model_type=CHOSEN_LOCAL_MODEL_TYPE, repo_id=LOCAL_REPO_ID, model_filename=LOCAL_MODEL_FILE, model_dir=LOCAL_MODEL_FOLDER)
565
- #print("Local model loaded:", local_model)
566
 
567
  summary_loop_description = "Creating summaries. " + str(latest_summary_completed) + " summaries completed so far."
568
  summary_loop = tqdm(range(latest_summary_completed, length_all_summaries), desc="Creating summaries", unit="summaries")
569
 
570
  if do_summaries == "Yes":
571
  for summary_no in summary_loop:
572
-
573
  print("Current summary number is:", summary_no)
574
 
575
  summary_text = all_summaries[summary_no]
@@ -631,8 +621,9 @@ def overall_summary(topic_summary_df:pd.DataFrame,
631
  latest_summary_completed = 0
632
  output_files = []
633
 
634
- model_choice_clean = model_name_map[model_choice]
635
- file_name = re.search(r'(.*?)(?:_batch_|_col_)', table_file_name).group(1) if re.search(r'(.*?)(?:_batch_|_col_)', table_file_name) else table_file_name
 
636
  latest_batch_completed = int(re.search(r'batch_(\d+)_', table_file_name).group(1)) if 'batch_' in table_file_name else ""
637
  batch_size_number = int(re.search(r'size_(\d+)_', table_file_name).group(1)) if 'size_' in table_file_name else ""
638
  in_column_cleaned = re.search(r'col_(.*?)_reference', table_file_name).group(1) if 'col_' in table_file_name else ""
@@ -659,14 +650,10 @@ def overall_summary(topic_summary_df:pd.DataFrame,
659
  if do_summaries == "Yes":
660
  for summary_no in summary_loop:
661
 
662
- print("Current summary number is:", summary_no)
663
-
664
  summary_text = topic_summary_df.to_markdown(index=False)
665
- #print("summary_text:", summary_text)
666
  formatted_summary_prompt = [summarise_everything_prompt.format(topic_summary_table=summary_text, summary_format=comprehensive_summary_format_prompt)]
667
 
668
- #print("formatted_summary_prompt:", formatted_summary_prompt)
669
-
670
  try:
671
  response, conversation_history, metadata = summarise_output_topics_query(model_choice, in_api_key, temperature, formatted_summary_prompt, summarise_topic_descriptions_system_prompt, local_model)
672
  summarised_output = response
@@ -687,16 +674,17 @@ def overall_summary(topic_summary_df:pd.DataFrame,
687
  toc = time.perf_counter()
688
  time_taken = toc - tic
689
 
690
- # Define the output file path for the formatted prompt
691
- formatted_prompt_output_path = output_folder + batch_file_path_details + "_overall_summary_" + model_choice_clean + ".txt"
 
692
 
693
  # Write the formatted prompt to the specified file
694
  try:
695
- with open(formatted_prompt_output_path, "w", encoding='utf-8', errors='replace') as f:
696
  f.write(summarised_output)
697
- output_files.append(formatted_prompt_output_path)
698
  except Exception as e:
699
- print(f"Error writing prompt to file {formatted_prompt_output_path}: {e}")
700
 
701
  output_files = list(set(output_files))
702
 
 
9
 
10
  from tools.prompts import summarise_topic_descriptions_prompt, summarise_topic_descriptions_system_prompt, system_prompt, summarise_everything_prompt, comprehensive_summary_format_prompt
11
  from tools.llm_funcs import construct_gemini_generative_model, process_requests, ResponseObject, load_model
12
+ from tools.helper_functions import create_topic_summary_df_from_reference_table, load_in_data_file, get_basic_response_data, convert_reference_table_to_pivot_table, wrap_text, clean_column_name
13
  from tools.config import OUTPUT_FOLDER, RUN_LOCAL_MODEL, MAX_COMMENT_CHARS, MAX_TOKENS, TIMEOUT_WAIT, NUMBER_OF_RETRY_ATTEMPTS, MAX_TIME_FOR_LOOP, BATCH_SIZE_DEFAULT, DEDUPLICATION_THRESHOLD, model_name_map, CHOSEN_LOCAL_MODEL_TYPE, LOCAL_REPO_ID, LOCAL_MODEL_FILE, LOCAL_MODEL_FOLDER, LLM_SEED
14
 
15
  max_tokens = MAX_TOKENS
 
158
  print(out_message)
159
  #raise Exception(out_message)
160
 
 
 
161
  # Run through this x times to try to get all duplicate topics
162
  if deduplicate_topics == "Yes":
163
+ if "Group" not in reference_df.columns:
164
+ reference_df["Group"] = "All"
165
  for i in range(0, 8):
166
  if merge_sentiment == "No":
167
  if merge_general_topics == "No":
168
  reference_df["old_category"] = reference_df["Subtopic"] + " | " + reference_df["Sentiment"]
169
  reference_df_unique = reference_df.drop_duplicates("old_category")
170
 
171
+ deduplicated_topic_map_df = reference_df_unique.groupby(["General Topic", "Sentiment", "Group"]).apply(
172
  lambda group: deduplicate_categories(
173
  group["Subtopic"],
174
  group["Sentiment"],
 
233
  # Remove rows where 'deduplicated_category' is blank or NaN
234
  deduplicated_topic_map_df = deduplicated_topic_map_df.loc[(deduplicated_topic_map_df['deduplicated_category'].str.strip() != '') & ~(deduplicated_topic_map_df['deduplicated_category'].isnull()), ['old_category','deduplicated_category', 'match_score']]
235
 
 
 
236
  reference_df = reference_df.merge(deduplicated_topic_map_df, on="old_category", how="left")
237
 
238
  reference_df.rename(columns={"Subtopic": "Subtopic_old", "Sentiment": "Sentiment_old"}, inplace=True)
 
244
  reference_df["Subtopic"] = reference_df["deduplicated_category"].combine_first(reference_df["Subtopic_old"])
245
  reference_df["Sentiment"] = reference_df["Sentiment"].combine_first(reference_df["Sentiment_old"])
246
 
247
+ #reference_df.drop(['old_category', 'deduplicated_category', "Subtopic_old", "Sentiment_old"], axis=1, inplace=True, errors="ignore")
248
+ #print("reference_df:", reference_df)
249
+ reference_df = reference_df[["Response References", "General Topic", "Subtopic", "Sentiment", "Summary", "Start row of group", "Group"]]
 
 
 
 
 
250
 
251
  if merge_general_topics == "Yes":
252
  # Replace General topic names for each Subtopic with that for the Subtopic with the most responses
 
278
  # Clean up the DataFrame by dropping the UniqueCount column
279
  reference_df.drop(columns=['UniqueCount'], inplace=True)
280
 
281
+ #print("reference_df:", reference_df)
282
+ reference_df = reference_df[["Response References", "General Topic", "Subtopic", "Sentiment", "Summary", "Start row of group", "Group"]]
283
+ #reference_df.drop(['old_category', 'deduplicated_category', "Subtopic_old", "Sentiment_old"], axis=1, inplace=True, errors="ignore")
284
+
285
  # Update reference summary column with all summaries
286
  reference_df["Summary"] = reference_df.groupby(
287
  ["Response References", "General Topic", "Subtopic", "Sentiment"]
 
296
  # Drop duplicates in the reference table - each comment should only have the same topic referred to once
297
  reference_df.drop_duplicates(['Response References', 'General Topic', 'Subtopic', 'Sentiment'], inplace=True)
298
 
 
299
  # Remake topic_summary_df based on new reference_df
300
  topic_summary_df = create_topic_summary_df_from_reference_table(reference_df)
301
 
302
  # Then merge the topic numbers back to the original dataframe
303
  reference_df = reference_df.merge(
304
+ topic_summary_df[['General Topic', 'Subtopic', 'Sentiment', 'Group', 'Topic_number']],
305
+ on=['General Topic', 'Subtopic', 'Sentiment', 'Group'],
306
  how='left'
307
  )
308
 
309
+ else: print("Topics have not been deduplicated")
 
310
 
311
 
312
  if not file_data.empty:
 
330
 
331
  # Outputs for markdown table output
332
  topic_summary_df_revised_display = topic_summary_df.apply(lambda col: col.map(lambda x: wrap_text(x, max_text_length=500)))
 
333
  deduplicated_unique_table_markdown = topic_summary_df_revised_display.to_markdown(index=False)
334
 
335
  return reference_df, topic_summary_df, output_files, log_output_files, deduplicated_unique_table_markdown
336
 
337
  def sample_reference_table_summaries(reference_df:pd.DataFrame,
 
338
  random_seed:int,
339
  no_of_sampled_summaries:int=150):
340
 
 
345
  all_summaries = pd.DataFrame()
346
  output_files = []
347
 
348
+ if "Group" not in reference_df.columns:
349
+ reference_df["Group"] = "All"
350
+
351
+ reference_df_grouped = reference_df.groupby(["General Topic", "Subtopic", "Sentiment", "Group"])
352
 
353
  if 'Revised summary' in reference_df.columns:
354
  out_message = "Summary has already been created for this file"
 
383
 
384
  summarised_references_markdown = summarised_references.to_markdown(index=False)
385
 
386
+ return summarised_references, summarised_references_markdown#, reference_df, topic_summary_df
387
 
388
  def summarise_output_topics_query(model_choice:str, in_api_key:str, temperature:float, formatted_summary_prompt:str, summarise_topic_descriptions_system_prompt:str, local_model=[]):
389
  conversation_history = []
 
447
  out_metadata = []
448
  local_model = []
449
  summarised_output_markdown = ""
450
+ output_files = []
451
 
452
  # Check for data for summarisations
453
  if not topic_summary_df.empty and not reference_table_df.empty:
 
469
  out_message = "No file data found, pivot table output will not be created."
470
  print(out_message)
471
  raise Exception(out_message)
472
+
473
+ if "Group" not in reference_table_df.columns: reference_table_df["Group"] = "All"
474
+ if "Group" not in topic_summary_df.columns: topic_summary_df["Group"] = "All"
475
 
476
+ try: all_summaries = summarised_references["Summary"].tolist()
477
+ except: all_summaries = summarised_references["Revised summary"].tolist()
 
 
478
 
479
  length_all_summaries = len(all_summaries)
480
 
 
483
  print("All summaries completed. Creating outputs.")
484
 
485
  model_choice_clean = model_name_map[model_choice]
486
+ file_name = re.search(r'(.*?)(?:_all_|_final_|_batch_|_col_)', table_file_name).group(1) if re.search(r'(.*?)(?:_all_|_final_|_batch_|_col_)', table_file_name) else table_file_name
487
  latest_batch_completed = int(re.search(r'batch_(\d+)_', table_file_name).group(1)) if 'batch_' in table_file_name else ""
488
  batch_size_number = int(re.search(r'size_(\d+)_', table_file_name).group(1)) if 'size_' in table_file_name else ""
489
  in_column_cleaned = re.search(r'col_(.*?)_reference', table_file_name).group(1) if 'col_' in table_file_name else ""
490
 
491
  # Save outputs for each batch. If master file created, label file as master
492
+ if latest_batch_completed: batch_file_path_details = f"{file_name}_batch_{latest_batch_completed}_size_{batch_size_number}_col_{in_column_cleaned}"
493
+ else: batch_file_path_details = f"{file_name}_col_{in_column_cleaned}"
 
 
494
 
495
  summarised_references["Revised summary"] = summarised_outputs
496
 
 
504
  # If no new summary is available, keep the original
505
  topic_summary_df_revised["Revised summary"] = topic_summary_df_revised["Revised summary"].combine_first(topic_summary_df_revised["Summary"])
506
 
507
+ topic_summary_df_revised = topic_summary_df_revised[["General Topic", "Subtopic", "Sentiment", "Group", "Number of responses", "Revised summary"]]
508
 
509
  # Replace all instances of 'Rows X to Y:' that remain on some topics that have not had additional summaries
510
  topic_summary_df_revised["Revised summary"] = topic_summary_df_revised["Revised summary"].str.replace("^Rows\s+\d+\s+to\s+\d+:\s*", "", regex=True)
 
538
 
539
  ###
540
  topic_summary_df_revised_display = topic_summary_df_revised.apply(lambda col: col.map(lambda x: wrap_text(x, max_text_length=500)))
 
541
  summarised_output_markdown = topic_summary_df_revised_display.to_markdown(index=False)
542
 
543
  # Ensure same file name not returned twice
 
552
  #print("Last summary number:", length_all_summaries)
553
 
554
  if (model_choice == CHOSEN_LOCAL_MODEL_TYPE) & (RUN_LOCAL_MODEL == "1"):
555
+ progress(0.1, f"Loading in local model: {CHOSEN_LOCAL_MODEL_TYPE}")
556
+ local_model, tokenizer = load_model(local_model_type=CHOSEN_LOCAL_MODEL_TYPE, repo_id=LOCAL_REPO_ID, model_filename=LOCAL_MODEL_FILE, model_dir=LOCAL_MODEL_FOLDER)
 
557
 
558
  summary_loop_description = "Creating summaries. " + str(latest_summary_completed) + " summaries completed so far."
559
  summary_loop = tqdm(range(latest_summary_completed, length_all_summaries), desc="Creating summaries", unit="summaries")
560
 
561
  if do_summaries == "Yes":
562
  for summary_no in summary_loop:
 
563
  print("Current summary number is:", summary_no)
564
 
565
  summary_text = all_summaries[summary_no]
 
621
  latest_summary_completed = 0
622
  output_files = []
623
 
624
+ model_choice_clean = model_name_map[model_choice]
625
+ model_choice_clean_short = clean_column_name(model_choice_clean, max_length=20, front_characters=False)
626
+ file_name = re.search(r'(.*?)(?:_all_|_final_|_batch_|_col_)', table_file_name).group(1) if re.search(r'(.*?)(?:_all_|_final_|_batch_|_col_)', table_file_name) else table_file_name
627
  latest_batch_completed = int(re.search(r'batch_(\d+)_', table_file_name).group(1)) if 'batch_' in table_file_name else ""
628
  batch_size_number = int(re.search(r'size_(\d+)_', table_file_name).group(1)) if 'size_' in table_file_name else ""
629
  in_column_cleaned = re.search(r'col_(.*?)_reference', table_file_name).group(1) if 'col_' in table_file_name else ""
 
650
  if do_summaries == "Yes":
651
  for summary_no in summary_loop:
652
 
 
 
653
  summary_text = topic_summary_df.to_markdown(index=False)
654
+
655
  formatted_summary_prompt = [summarise_everything_prompt.format(topic_summary_table=summary_text, summary_format=comprehensive_summary_format_prompt)]
656
 
 
 
657
  try:
658
  response, conversation_history, metadata = summarise_output_topics_query(model_choice, in_api_key, temperature, formatted_summary_prompt, summarise_topic_descriptions_system_prompt, local_model)
659
  summarised_output = response
 
674
  toc = time.perf_counter()
675
  time_taken = toc - tic
676
 
677
+ # Define the output file path for the output
678
+ print("batch_file_path_details just before save:", batch_file_path_details)
679
+ overall_summary_output_path = output_folder + batch_file_path_details + "_overall_summary_" + model_choice_clean_short + "_temp_" + str(temperature) + ".txt"
680
 
681
  # Write the formatted prompt to the specified file
682
  try:
683
+ with open(overall_summary_output_path, "w", encoding='utf-8', errors='replace') as f:
684
  f.write(summarised_output)
685
+ output_files.append(overall_summary_output_path)
686
  except Exception as e:
687
+ print(f"Error writing prompt to file {overall_summary_output_path}: {e}")
688
 
689
  output_files = list(set(output_files))
690
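tools/dedup_summaries.py now recovers the base file name and batch details from the uploaded table's file name (adding `_all_` and `_final_` to the recognised markers) before composing output paths such as the overall summary text file. A condensed sketch of that parsing, run on a hypothetical file name:

```python
import re

def parse_table_file_name(table_file_name: str) -> dict:
    # Recover the base name before any _all_/_final_/_batch_/_col_ marker,
    # as the updated summarise/overall_summary functions do.
    match = re.search(r'(.*?)(?:_all_|_final_|_batch_|_col_)', table_file_name)
    file_name = match.group(1) if match else table_file_name

    batch = re.search(r'batch_(\d+)_', table_file_name)
    size = re.search(r'size_(\d+)_', table_file_name)
    col = re.search(r'col_(.*?)_reference', table_file_name)

    return {
        "file_name": file_name,
        "latest_batch_completed": int(batch.group(1)) if batch else "",
        "batch_size_number": int(size.group(1)) if size else "",
        "in_column_cleaned": col.group(1) if col else "",
    }

# Hypothetical example file name, for illustration only:
print(parse_table_file_name("survey_batch_3_size_50_col_comments_reference_table.csv"))
# {'file_name': 'survey', 'latest_batch_completed': 3,
#  'batch_size_number': 50, 'in_column_cleaned': 'comments'}
```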
 
tools/helper_functions.py CHANGED
@@ -32,7 +32,6 @@ def empty_output_vars_extract_topics():
32
 
33
  return master_topic_df_state, master_topic_summary_df_state, master_reference_df_state, text_output_file, text_output_file_list_state, latest_batch_completed, log_files_output, log_files_output_list_state, conversation_metadata_textbox, estimated_time_taken_number, file_data_state, reference_data_file_name_textbox, display_topic_table_markdown, summary_output_file_list, summary_input_file_list, overall_summarisation_input_files, overall_summary_output_files
34
 
35
-
36
  def empty_output_vars_summarise():
37
  # Empty output objects before summarising files
38
 
@@ -47,7 +46,7 @@ def empty_output_vars_summarise():
47
 
48
  return summary_reference_table_sample_state, master_topic_summary_df_revised_summaries_state, master_reference_df_revised_summaries_state, summary_output_files, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, overall_summarisation_input_files
49
 
50
- def get_or_create_env_var(var_name, default_value):
51
  # Get the environment variable if it exists
52
  value = os.environ.get(var_name)
53
 
@@ -58,14 +57,14 @@ def get_or_create_env_var(var_name, default_value):
58
 
59
  return value
60
 
61
- def get_file_path_with_extension(file_path):
62
  # First, get the basename of the file (e.g., "example.txt" from "/path/to/example.txt")
63
  basename = os.path.basename(file_path)
64
 
65
  # Return the basename with its extension
66
  return basename
67
 
68
- def get_file_name_no_ext(file_path):
69
  # First, get the basename of the file (e.g., "example.txt" from "/path/to/example.txt")
70
  basename = os.path.basename(file_path)
71
 
@@ -76,7 +75,7 @@ def get_file_name_no_ext(file_path):
76
 
77
  return filename_without_extension
78
 
79
- def detect_file_type(filename):
80
  """Detect the file type based on its extension."""
81
  if (filename.endswith('.csv')) | (filename.endswith('.csv.gz')) | (filename.endswith('.zip')):
82
  return 'csv'
@@ -232,7 +231,6 @@ def join_cols_onto_reference_df(reference_df:pd.DataFrame, original_data_df:pd.D
232
 
233
  return out_reference_df, file_data_outputs
234
 
235
-
236
  def get_basic_response_data(file_data:pd.DataFrame, chosen_cols:List[str], verify_titles:bool=False) -> pd.DataFrame:
237
 
238
  if not isinstance(chosen_cols, list):
@@ -253,9 +251,7 @@ def get_basic_response_data(file_data:pd.DataFrame, chosen_cols:List[str], verif
253
  basic_response_data = basic_response_data[['Reference', 'Response', 'Original Reference']]
254
 
255
  basic_response_data["Response"] = basic_response_data["Response"].str.strip()
256
- basic_response_data["Response"] = basic_response_data["Response"].apply(initial_clean)
257
-
258
- print("basic_response_data:", basic_response_data)
259
 
260
  return basic_response_data
261
 
@@ -291,7 +287,10 @@ def convert_reference_table_to_pivot_table(df:pd.DataFrame, basic_response_data:
291
 
292
  def create_topic_summary_df_from_reference_table(reference_df:pd.DataFrame):
293
 
294
- out_topic_summary_df = (reference_df.groupby(["General Topic", "Subtopic", "Sentiment"])
 
 
 
295
  .agg({
296
  'Response References': 'size', # Count the number of references
297
  'Summary': lambda x: '<br>'.join(
@@ -299,12 +298,14 @@ def create_topic_summary_df_from_reference_table(reference_df:pd.DataFrame):
299
  )
300
  })
301
  .reset_index()
302
- .sort_values('Response References', ascending=False) # Sort by size, biggest first
303
  .assign(Topic_number=lambda df: np.arange(1, len(df) + 1)) # Add numbering 1 to x
304
  )
305
 
306
  out_topic_summary_df = out_topic_summary_df.rename(columns={"Response References": "Number of responses"}, errors="ignore")
307
 
 
 
308
  return out_topic_summary_df
309
 
310
  # Wrap text in each column to the specified max width, including whole words
 
32
 
33
  return master_topic_df_state, master_topic_summary_df_state, master_reference_df_state, text_output_file, text_output_file_list_state, latest_batch_completed, log_files_output, log_files_output_list_state, conversation_metadata_textbox, estimated_time_taken_number, file_data_state, reference_data_file_name_textbox, display_topic_table_markdown, summary_output_file_list, summary_input_file_list, overall_summarisation_input_files, overall_summary_output_files
34
 
 
35
  def empty_output_vars_summarise():
36
  # Empty output objects before summarising files
37
 
 
46
 
47
  return summary_reference_table_sample_state, master_topic_summary_df_revised_summaries_state, master_reference_df_revised_summaries_state, summary_output_files, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, overall_summarisation_input_files
48
 
49
+ def get_or_create_env_var(var_name:str, default_value:str):
50
  # Get the environment variable if it exists
51
  value = os.environ.get(var_name)
52
 
 
57
 
58
  return value
59
 
60
+ def get_file_path_with_extension(file_path:str):
61
  # First, get the basename of the file (e.g., "example.txt" from "/path/to/example.txt")
62
  basename = os.path.basename(file_path)
63
 
64
  # Return the basename with its extension
65
  return basename
66
 
67
+ def get_file_name_no_ext(file_path:str):
68
  # First, get the basename of the file (e.g., "example.txt" from "/path/to/example.txt")
69
  basename = os.path.basename(file_path)
70
 
 
75
 
76
  return filename_without_extension
77
 
78
+ def detect_file_type(filename:str):
79
  """Detect the file type based on its extension."""
80
  if (filename.endswith('.csv')) | (filename.endswith('.csv.gz')) | (filename.endswith('.zip')):
81
  return 'csv'
 
231
 
232
  return out_reference_df, file_data_outputs
233
 
 
234
  def get_basic_response_data(file_data:pd.DataFrame, chosen_cols:List[str], verify_titles:bool=False) -> pd.DataFrame:
235
 
236
  if not isinstance(chosen_cols, list):
 
251
  basic_response_data = basic_response_data[['Reference', 'Response', 'Original Reference']]
252
 
253
  basic_response_data["Response"] = basic_response_data["Response"].str.strip()
254
+ basic_response_data["Response"] = basic_response_data["Response"].apply(initial_clean)
 
 
255
 
256
  return basic_response_data
257
 
 
287
 
288
  def create_topic_summary_df_from_reference_table(reference_df:pd.DataFrame):
289
 
290
+ if "Group" not in reference_df.columns:
291
+ reference_df["Group"] = "All"
292
+
293
+ out_topic_summary_df = (reference_df.groupby(["General Topic", "Subtopic", "Sentiment", "Group"])
294
  .agg({
295
  'Response References': 'size', # Count the number of references
296
  'Summary': lambda x: '<br>'.join(
 
298
  )
299
  })
300
  .reset_index()
301
+ #.sort_values('Response References', ascending=False) # Sort by size, biggest first
302
  .assign(Topic_number=lambda df: np.arange(1, len(df) + 1)) # Add numbering 1 to x
303
  )
304
 
305
  out_topic_summary_df = out_topic_summary_df.rename(columns={"Response References": "Number of responses"}, errors="ignore")
306
 
307
+ out_topic_summary_df = out_topic_summary_df.sort_values(["Group", "Number of responses", "General Topic", "Subtopic", "Sentiment"], ascending=[True, False, True, True, True])
308
+
309
  return out_topic_summary_df
310
 
311
  # Wrap text in each column to the specified max width, including whole words
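In tools/helper_functions.py, `create_topic_summary_df_from_reference_table` now aggregates over the group-aware key and sorts within each group by response count rather than globally. A sketch of the reconstructed aggregation; the non-blank filter inside the `Summary` join is an assumption, since the diff truncates that lambda:

```python
import numpy as np
import pandas as pd

def build_topic_summary(reference_df: pd.DataFrame) -> pd.DataFrame:
    # Group-aware rebuild of the unique-topics table: count responses,
    # join the summaries, number the topics, then sort within each group.
    if "Group" not in reference_df.columns:
        reference_df["Group"] = "All"

    summary = (reference_df
               .groupby(["General Topic", "Subtopic", "Sentiment", "Group"])
               .agg({"Response References": "size",
                     # Assumed join condition: keep non-blank string summaries.
                     "Summary": lambda x: "<br>".join(
                         s for s in x if isinstance(s, str) and s.strip())})
               .reset_index()
               .assign(Topic_number=lambda df: np.arange(1, len(df) + 1))
               .rename(columns={"Response References": "Number of responses"}))

    return summary.sort_values(
        ["Group", "Number of responses", "General Topic", "Subtopic", "Sentiment"],
        ascending=[True, False, True, True, True])
```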
tools/llm_api_call.py CHANGED
@@ -421,7 +421,6 @@ def write_llm_output_and_logs(responses: List[ResponseObject],
421
  reference_table_out_path = "reference_table_error.csv"
422
  topic_summary_df_out_path = "unique_topic_table_error.csv"
423
  topic_with_response_df = pd.DataFrame()
424
- markdown_table = ""
425
  out_reference_df = pd.DataFrame()
426
  out_topic_summary_df = pd.DataFrame()
427
  batch_file_path_details = "error"
@@ -461,7 +460,7 @@ def write_llm_output_and_logs(responses: List[ResponseObject],
461
  topic_with_response_df, is_error = convert_response_text_to_dataframe(response_text)
462
  except Exception as e:
463
  print("Error in parsing markdown table from response text:", e)
464
- return topic_table_out_path, reference_table_out_path, topic_summary_df_out_path, topic_with_response_df, markdown_table, out_reference_df, out_topic_summary_df, batch_file_path_details, is_error
465
 
466
  # Rename columns to ensure consistent use of data frames later in code
467
  new_column_names = {
@@ -607,9 +606,11 @@ def write_llm_output_and_logs(responses: List[ResponseObject],
607
 
608
  out_topic_summary_df = out_topic_summary_df.rename(columns={"Response References":"Number of responses"}, errors="ignore")
609
 
 
 
610
  topic_summary_df_out_path = output_folder + batch_file_path_details + "_unique_topics_" + model_choice_clean + "_temp_" + str(temperature) + ".csv"
611
 
612
- return topic_table_out_path, reference_table_out_path, topic_summary_df_out_path, topic_with_response_df, markdown_table, out_reference_df, out_topic_summary_df, batch_file_path_details, is_error
613
 
614
  def generate_zero_shot_topics_df(zero_shot_topics:pd.DataFrame,
615
  force_zero_shot_radio:str="No",
@@ -988,7 +989,7 @@ def extract_topics(in_data_file,
988
  full_prompt = formatted_system_prompt + formatted_summary_prompt
989
 
990
  # Define the output file path for the formatted prompt
991
- formatted_prompt_output_path = output_folder + clean_column_name(file_name, max_length=30, front_characters=False) + "_" + str(reported_batch_no) + "_full_prompt_" + clean_column_name(model_choice_clean, max_length = 20, front_characters=False) + "_temp_" + str(temperature) + ".txt"
992
 
993
  # Write the formatted prompt to the specified file
994
  try:
@@ -1009,7 +1010,7 @@ def extract_topics(in_data_file,
1009
  responses, conversation_history, whole_conversation, whole_conversation_metadata, response_text = call_llm_with_markdown_table_checks(summary_prompt_list, system_prompt, conversation_history, whole_conversation, whole_conversation_metadata, google_client, google_config, model_choice, temperature, reported_batch_no, local_model, MAX_OUTPUT_VALIDATION_ATTEMPTS, master = True)
1010
 
1011
  # Return output tables
1012
- topic_table_out_path, reference_table_out_path, topic_summary_df_out_path, new_topic_df, new_markdown_table, new_reference_df, new_topic_summary_df, master_batch_out_file_part, is_error = write_llm_output_and_logs(responses, whole_conversation, whole_conversation_metadata, file_name, latest_batch_completed, start_row, end_row, model_choice_clean, temperature, log_files_output_paths, existing_reference_df, existing_topic_summary_df, batch_size, chosen_cols, batch_basic_response_df, group_name, first_run=False, output_folder=output_folder)
1013
 
1014
  # Write final output to text file for logging purposes
1015
  try:
@@ -1046,6 +1047,8 @@ def extract_topics(in_data_file,
1046
  ## Unique topic list
1047
  new_topic_summary_df = pd.concat([new_topic_summary_df, existing_topic_summary_df]).drop_duplicates('Subtopic')
1048
 
 
 
1049
  new_topic_summary_df.to_csv(topic_summary_df_out_path, index=None)
1050
  out_file_paths.append(topic_summary_df_out_path)
1051
 
@@ -1101,7 +1104,7 @@ def extract_topics(in_data_file,
1101
  responses, conversation_history, whole_conversation, whole_conversation_metadata, response_text = call_llm_with_markdown_table_checks(batch_prompts, system_prompt, conversation_history, whole_conversation, whole_conversation_metadata, google_client, google_config, model_choice, temperature, reported_batch_no, local_model, MAX_OUTPUT_VALIDATION_ATTEMPTS)
1102
 
1103
 
1104
- topic_table_out_path, reference_table_out_path, topic_summary_df_out_path, topic_table_df, markdown_table, reference_df, new_topic_summary_df, batch_file_path_details, is_error = write_llm_output_and_logs(responses, whole_conversation, whole_conversation_metadata, file_name, latest_batch_completed, start_row, end_row, model_choice_clean, temperature, log_files_output_paths, existing_reference_df, existing_topic_summary_df, batch_size, chosen_cols, batch_basic_response_df, group_name, first_run=True, output_folder=output_folder)
1105
 
1106
  # If error in table parsing, leave function
1107
  if is_error == True:
@@ -1121,6 +1124,8 @@ def extract_topics(in_data_file,
1121
 
1122
  new_topic_summary_df = pd.concat([new_topic_summary_df, existing_topic_summary_df]).drop_duplicates('Subtopic')
1123
 
 
 
1124
  new_topic_summary_df.to_csv(topic_summary_df_out_path, index=None)
1125
  out_file_paths.append(topic_summary_df_out_path)
1126
 
@@ -1131,20 +1136,23 @@ def extract_topics(in_data_file,
1131
 
1132
  # Write final output to text file also
1133
  try:
1134
- final_table_output_path = output_folder + batch_file_path_details + "_full_final_response_" + model_choice_clean + "_temp_" + str(temperature) + ".txt"
1135
-
1136
- if isinstance(responses[-1], ResponseObject):
1137
- with open(final_table_output_path, "w", encoding='utf-8', errors='replace') as f:
1138
- f.write(responses[-1].text)
1139
- unique_table_df_display_table_markdown = responses[-1].text
1140
- elif "choices" in responses[-1]:
1141
- with open(final_table_output_path, "w", encoding='utf-8', errors='replace') as f:
1142
- f.write(responses[-1]["choices"][0]['text'])
1143
- unique_table_df_display_table_markdown =responses[-1]["choices"][0]['text']
1144
- else:
1145
- with open(final_table_output_path, "w", encoding='utf-8', errors='replace') as f:
1146
- f.write(responses[-1].text)
1147
- unique_table_df_display_table_markdown = responses[-1].text
1148
 
1149
  log_files_output_paths.append(final_table_output_path)
1150
 
@@ -1203,7 +1211,7 @@ def extract_topics(in_data_file,
1203
 
1204
  print("All summaries completed. Creating outputs.")
1205
 
1206
- model_choice_clean = clean_column_name(model_name_map[model_choice], max_length=20, front_characters=False)
1207
  # Example usage
1208
  in_column_cleaned = clean_column_name(chosen_cols, max_length=20)
1209
 
@@ -1214,14 +1222,13 @@ def extract_topics(in_data_file,
1214
  file_path_details = f"{file_name_cleaned}_col_{in_column_cleaned}"
1215
 
1216
  # Create a pivoted reference table
1217
- existing_reference_df_pivot = convert_reference_table_to_pivot_table(existing_reference_df)
1218
 
1219
  # Save the new DataFrame to CSV
1220
- #topic_table_out_path = output_folder + batch_file_path_details + "_topic_table_" + model_choice_clean + "_temp_" + str(temperature) + ".csv"
1221
- reference_table_out_pivot_path = output_folder + file_path_details + "_final_reference_table_pivot_" + model_choice_clean + "_temp_" + str(temperature) + ".csv"
1222
- reference_table_out_path = output_folder + file_path_details + "_final_reference_table_" + model_choice_clean + "_temp_" + str(temperature) + ".csv"
1223
- topic_summary_df_out_path = output_folder + file_path_details + "_final_unique_topics_" + model_choice_clean + "_temp_" + str(temperature) + ".csv"
1224
- basic_response_data_out_path = output_folder + file_path_details + "_simplified_data_file_" + model_choice_clean + "_temp_" + str(temperature) + ".csv"
1225
 
1226
  ## Reference table mapping response numbers to topics
1227
  existing_reference_df.to_csv(reference_table_out_path, index=None)
@@ -1230,30 +1237,33 @@ def extract_topics(in_data_file,
1230
 
1231
  # Create final unique topics table from reference table to ensure consistent numbers
1232
  final_out_topic_summary_df = create_topic_summary_df_from_reference_table(existing_reference_df)
1233
 
1234
  ## Unique topic list
1235
  final_out_topic_summary_df.to_csv(topic_summary_df_out_path, index=None, encoding='utf-8')
1236
  out_file_paths.append(topic_summary_df_out_path)
1237
 
1238
  # Ensure that we are only returning the final results to outputs
1239
  out_file_paths = [x for x in out_file_paths if '_final_' in x]
1240
 
1241
  ## Reference table mapping response numbers to topics
1242
  existing_reference_df_pivot.to_csv(reference_table_out_pivot_path, index = None, encoding='utf-8')
1243
  log_files_output_paths.append(reference_table_out_pivot_path)
1244
 
1245
  ## Create a dataframe for missing response references:
1246
  # Assuming existing_reference_df and file_data are already defined
1247
- # Simplify table to just responses column and the Response reference number
1248
-
1249
  basic_response_data = get_basic_response_data(file_data, chosen_cols)
1250
 
1251
-
1252
  # Save simplified file data to log outputs
1253
  pd.DataFrame(basic_response_data).to_csv(basic_response_data_out_path, index=None, encoding='utf-8')
1254
  log_files_output_paths.append(basic_response_data_out_path)
1255
 
1256
-
1257
  # Step 1: Identify missing references
1258
  missing_references = basic_response_data[~basic_response_data['Reference'].astype(str).isin(existing_reference_df['Response References'].astype(str).unique())]
1259
 
@@ -1267,7 +1277,7 @@ def extract_topics(in_data_file,
1267
  # Display the new DataFrame
1268
  #print("missing_df:", missing_df)
1269
 
1270
- missing_df_out_path = output_folder + file_path_details + "_missing_references_" + model_choice_clean + "_temp_" + str(temperature) + ".csv"
1271
  missing_df.to_csv(missing_df_out_path, index=None, encoding='utf-8')
1272
  log_files_output_paths.append(missing_df_out_path)
1273
 
@@ -1281,10 +1291,10 @@ def extract_topics(in_data_file,
1281
 
1282
  print("latest_batch_completed at end of batch iterations to return is", latest_batch_completed)
1283
 
1284
- return unique_table_df_display_table_markdown, existing_topics_table, final_out_topic_summary_df, existing_reference_df, final_out_file_paths, final_out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, final_out_file_paths, final_out_file_paths, modifiable_topic_summary_df, final_out_file_paths, join_file_paths # gr.Dataframe(value=modifiable_topic_summary_df, headers=None, col_count=(modifiable_topic_summary_df.shape[1], "fixed"), row_count = (modifiable_topic_summary_df.shape[0], "fixed"), visible=True, type="pandas"),
1285
 
1286
 
1287
- return unique_table_df_display_table_markdown, existing_topics_table, existing_topic_summary_df, existing_reference_df, out_file_paths, out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, out_file_paths, out_file_paths, modifiable_topic_summary_df, out_file_paths, join_file_paths # gr.Dataframe(value=modifiable_topic_summary_df, headers=None, col_count=(modifiable_topic_summary_df.shape[1], "fixed"), row_count = (modifiable_topic_summary_df.shape[0], "fixed"), visible=True, type="pandas"),
1288
 
1289
  def wrapper_extract_topics_per_column_value(
1290
  selected_col: str,
@@ -1350,6 +1360,7 @@ def wrapper_extract_topics_per_column_value(
1350
  acc_topics_table = initial_existing_topics_table.copy()
1351
  acc_reference_df = initial_existing_reference_df.copy()
1352
  acc_topic_summary_df = initial_existing_topic_summary_df.copy()
 
1353
 
1354
  # Lists are extended
1355
  acc_out_file_paths = []
@@ -1365,7 +1376,7 @@ def wrapper_extract_topics_per_column_value(
1365
 
1366
  wrapper_first_loop = initial_first_loop_state
1367
 
1368
- for i, group_value in enumerate(unique_values):
1369
  print(f"\nProcessing segment: {selected_col} = {group_value} ({i+1}/{len(unique_values)})")
1370
 
1371
  filtered_file_data = file_data.copy()
@@ -1412,6 +1423,7 @@ def wrapper_extract_topics_per_column_value(
1412
  seg_gradio_df,
1413
  _seg_out_files5, # Often same as 1
1414
  seg_join_files,
 
1415
  ) = extract_topics(
1416
  in_data_file=in_data_file,
1417
  file_data=filtered_file_data,
@@ -1460,9 +1472,9 @@ def wrapper_extract_topics_per_column_value(
1460
  # Aggregate results
1461
  # The DFs returned by extract_topics are already cumulative for *its own run*.
1462
  # We now make them cumulative for the *wrapper's run*.
1463
- acc_topics_table = seg_topics_table
1464
- acc_reference_df = seg_reference_df
1465
- acc_topic_summary_df = seg_topic_summary_df
1466
 
1467
  # For lists, extend. Use set to remove duplicates if paths might be re-added.
1468
  acc_out_file_paths.extend(f for f in seg_out_files1 if f not in acc_out_file_paths)
@@ -1484,6 +1496,32 @@ def wrapper_extract_topics_per_column_value(
1484
  # Optionally, decide if you want to continue with other segments or stop
1485
  # For now, it will continue
1486
  continue
1487
 
1488
  print(f"\nWrapper finished processing all segments. Total time: {acc_total_time_taken:.2f}s")
1489
 
 
421
  reference_table_out_path = "reference_table_error.csv"
422
  topic_summary_df_out_path = "unique_topic_table_error.csv"
423
  topic_with_response_df = pd.DataFrame()
 
424
  out_reference_df = pd.DataFrame()
425
  out_topic_summary_df = pd.DataFrame()
426
  batch_file_path_details = "error"
 
460
  topic_with_response_df, is_error = convert_response_text_to_dataframe(response_text)
461
  except Exception as e:
462
  print("Error in parsing markdown table from response text:", e)
463
+ return topic_table_out_path, reference_table_out_path, topic_summary_df_out_path, topic_with_response_df, out_reference_df, out_topic_summary_df, batch_file_path_details, is_error
464
 
465
  # Rename columns to ensure consistent use of data frames later in code
466
  new_column_names = {
 
606
 
607
  out_topic_summary_df = out_topic_summary_df.rename(columns={"Response References":"Number of responses"}, errors="ignore")
608
 
609
+ out_topic_summary_df["Group"] = group_name
610
+
611
  topic_summary_df_out_path = output_folder + batch_file_path_details + "_unique_topics_" + model_choice_clean + "_temp_" + str(temperature) + ".csv"
612
 
613
+ return topic_table_out_path, reference_table_out_path, topic_summary_df_out_path, topic_with_response_df, out_reference_df, out_topic_summary_df, batch_file_path_details, is_error
614
 
615
  def generate_zero_shot_topics_df(zero_shot_topics:pd.DataFrame,
616
  force_zero_shot_radio:str="No",
 
989
  full_prompt = formatted_system_prompt + formatted_summary_prompt
990
 
991
  # Define the output file path for the formatted prompt
992
+ formatted_prompt_output_path = output_folder + batch_file_path_details + "_full_prompt_" + clean_column_name(model_choice_clean, max_length = 20, front_characters=False) + "_temp_" + str(temperature) + ".txt"
993
 
994
  # Write the formatted prompt to the specified file
995
  try:
 
1010
  responses, conversation_history, whole_conversation, whole_conversation_metadata, response_text = call_llm_with_markdown_table_checks(summary_prompt_list, system_prompt, conversation_history, whole_conversation, whole_conversation_metadata, google_client, google_config, model_choice, temperature, reported_batch_no, local_model, MAX_OUTPUT_VALIDATION_ATTEMPTS, master = True)
1011
 
1012
  # Return output tables
1013
+ topic_table_out_path, reference_table_out_path, topic_summary_df_out_path, new_topic_df, new_reference_df, new_topic_summary_df, master_batch_out_file_part, is_error = write_llm_output_and_logs(responses, whole_conversation, whole_conversation_metadata, file_name, latest_batch_completed, start_row, end_row, model_choice_clean, temperature, log_files_output_paths, existing_reference_df, existing_topic_summary_df, batch_size, chosen_cols, batch_basic_response_df, group_name, first_run=False, output_folder=output_folder)
1014
 
1015
  # Write final output to text file for logging purposes
1016
  try:
 
1047
  ## Unique topic list
1048
  new_topic_summary_df = pd.concat([new_topic_summary_df, existing_topic_summary_df]).drop_duplicates('Subtopic')
1049
 
1050
+ new_topic_summary_df["Group"] = group_name
1051
+
1052
  new_topic_summary_df.to_csv(topic_summary_df_out_path, index=None)
1053
  out_file_paths.append(topic_summary_df_out_path)
1054
 
 
1104
  responses, conversation_history, whole_conversation, whole_conversation_metadata, response_text = call_llm_with_markdown_table_checks(batch_prompts, system_prompt, conversation_history, whole_conversation, whole_conversation_metadata, google_client, google_config, model_choice, temperature, reported_batch_no, local_model, MAX_OUTPUT_VALIDATION_ATTEMPTS)
1105
 
1106
 
1107
+ topic_table_out_path, reference_table_out_path, topic_summary_df_out_path, topic_table_df, reference_df, new_topic_summary_df, batch_file_path_details, is_error = write_llm_output_and_logs(responses, whole_conversation, whole_conversation_metadata, file_name, latest_batch_completed, start_row, end_row, model_choice_clean, temperature, log_files_output_paths, existing_reference_df, existing_topic_summary_df, batch_size, chosen_cols, batch_basic_response_df, group_name, first_run=True, output_folder=output_folder)
1108
 
1109
  # If error in table parsing, leave function
1110
  if is_error == True:
 
1124
 
1125
  new_topic_summary_df = pd.concat([new_topic_summary_df, existing_topic_summary_df]).drop_duplicates('Subtopic')
1126
 
1127
+ new_topic_summary_df["Group"] = group_name
1128
+
1129
  new_topic_summary_df.to_csv(topic_summary_df_out_path, index=None)
1130
  out_file_paths.append(topic_summary_df_out_path)
1131
 
 
1136
 
1137
  # Write final output to text file also
1138
  try:
1139
+ final_table_output_path = output_folder + batch_file_path_details + "_full_final_response_" + clean_column_name(model_choice_clean, max_length = 20, front_characters=False) + "_temp_" + str(temperature) + ".txt"
1140
+
1141
+ # if isinstance(responses[-1], ResponseObject):
1142
+ # with open(final_table_output_path, "w", encoding='utf-8', errors='replace') as f:
1143
+ # f.write(responses[-1].text)
1144
+ # unique_table_df_display_table_markdown = responses[-1].text
1145
+ # elif "choices" in responses[-1]:
1146
+ # with open(final_table_output_path, "w", encoding='utf-8', errors='replace') as f:
1147
+ # f.write(responses[-1]["choices"][0]['text'])
1148
+ # unique_table_df_display_table_markdown =responses[-1]["choices"][0]['text']
1149
+ # else:
1150
+ # with open(final_table_output_path, "w", encoding='utf-8', errors='replace') as f:
1151
+ # f.write(responses[-1].text)
1152
+ # unique_table_df_display_table_markdown = responses[-1].text
1153
+
1154
+ unique_table_df_display_table = new_topic_summary_df.apply(lambda col: col.map(lambda x: wrap_text(x, max_text_length=500)))
1155
+ unique_table_df_display_table_markdown = unique_table_df_display_table[["General Topic", "Subtopic", "Sentiment", "Number of responses", "Summary"]].to_markdown(index=False)
1156
 
1157
  log_files_output_paths.append(final_table_output_path)
1158
 
 
1211
 
1212
  print("All summaries completed. Creating outputs.")
1213
 
1214
+ model_choice_clean_short = clean_column_name(model_choice_clean, max_length=20, front_characters=False)
1215
  # Example usage
1216
  in_column_cleaned = clean_column_name(chosen_cols, max_length=20)
1217
 
 
1222
  file_path_details = f"{file_name_cleaned}_col_{in_column_cleaned}"
1223
 
1224
  # Create a pivoted reference table
1225
+ existing_reference_df_pivot = convert_reference_table_to_pivot_table(existing_reference_df)
1226
 
1227
  # Save the new DataFrame to CSV
1228
+ reference_table_out_pivot_path = output_folder + file_path_details + "_final_reference_table_pivot_" + model_choice_clean_short + "_temp_" + str(temperature) + ".csv"
1229
+ reference_table_out_path = output_folder + file_path_details + "_final_reference_table_" + model_choice_clean_short + "_temp_" + str(temperature) + ".csv"
1230
+ topic_summary_df_out_path = output_folder + file_path_details + "_final_unique_topics_" + model_choice_clean_short + "_temp_" + str(temperature) + ".csv"
1231
+ basic_response_data_out_path = output_folder + file_path_details + "_simplified_data_file_" + model_choice_clean_short + "_temp_" + str(temperature) + ".csv"
 
1232
 
1233
  ## Reference table mapping response numbers to topics
1234
  existing_reference_df.to_csv(reference_table_out_path, index=None)
 
1237
 
1238
  # Create final unique topics table from reference table to ensure consistent numbers
1239
  final_out_topic_summary_df = create_topic_summary_df_from_reference_table(existing_reference_df)
1240
+ final_out_topic_summary_df["Group"] = group_name
1241
 
1242
  ## Unique topic list
1243
  final_out_topic_summary_df.to_csv(topic_summary_df_out_path, index=None, encoding='utf-8')
1244
  out_file_paths.append(topic_summary_df_out_path)
1245
 
1246
+ # Outputs for markdown table output
1247
+ unique_table_df_display_table = final_out_topic_summary_df.apply(lambda col: col.map(lambda x: wrap_text(x, max_text_length=500)))
1248
+ unique_table_df_display_table_markdown = unique_table_df_display_table[["General Topic", "Subtopic", "Sentiment", "Number of responses", "Summary", "Group"]].to_markdown(index=False)
1249
+
1250
  # Ensure that we are only returning the final results to outputs
1251
  out_file_paths = [x for x in out_file_paths if '_final_' in x]
1252
 
1253
  ## Reference table mapping response numbers to topics
1254
+ existing_reference_df_pivot["Group"] = group_name
1255
  existing_reference_df_pivot.to_csv(reference_table_out_pivot_path, index = None, encoding='utf-8')
1256
  log_files_output_paths.append(reference_table_out_pivot_path)
1257
 
1258
  ## Create a dataframe for missing response references:
1259
  # Assuming existing_reference_df and file_data are already defined
1260
+ # Simplify table to just responses column and the Response reference number
 
1261
  basic_response_data = get_basic_response_data(file_data, chosen_cols)
1262
 
 
1263
  # Save simplified file data to log outputs
1264
  pd.DataFrame(basic_response_data).to_csv(basic_response_data_out_path, index=None, encoding='utf-8')
1265
  log_files_output_paths.append(basic_response_data_out_path)
1266
 
 
1267
  # Step 1: Identify missing references
1268
  missing_references = basic_response_data[~basic_response_data['Reference'].astype(str).isin(existing_reference_df['Response References'].astype(str).unique())]
1269
 
 
1277
  # Display the new DataFrame
1278
  #print("missing_df:", missing_df)
1279
 
1280
+ missing_df_out_path = output_folder + file_path_details + "_missing_references_" + model_choice_clean_short + "_temp_" + str(temperature) + ".csv"
1281
  missing_df.to_csv(missing_df_out_path, index=None, encoding='utf-8')
1282
  log_files_output_paths.append(missing_df_out_path)
1283
 
 
1291
 
1292
  print("latest_batch_completed at end of batch iterations to return is", latest_batch_completed)
1293
 
1294
+ return unique_table_df_display_table_markdown, existing_topics_table, final_out_topic_summary_df, existing_reference_df, final_out_file_paths, final_out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, final_out_file_paths, final_out_file_paths, modifiable_topic_summary_df, final_out_file_paths, join_file_paths, existing_reference_df_pivot # gr.Dataframe(value=modifiable_topic_summary_df, headers=None, col_count=(modifiable_topic_summary_df.shape[1], "fixed"), row_count = (modifiable_topic_summary_df.shape[0], "fixed"), visible=True, type="pandas"),
1295
 
1296
 
1297
+ return unique_table_df_display_table_markdown, existing_topics_table, existing_topic_summary_df, existing_reference_df, out_file_paths, out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, out_file_paths, out_file_paths, modifiable_topic_summary_df, out_file_paths, join_file_paths, existing_reference_df_pivot # gr.Dataframe(value=modifiable_topic_summary_df, headers=None, col_count=(modifiable_topic_summary_df.shape[1], "fixed"), row_count = (modifiable_topic_summary_df.shape[0], "fixed"), visible=True, type="pandas"),
1298
 
1299
  def wrapper_extract_topics_per_column_value(
1300
  selected_col: str,
 
1360
  acc_topics_table = initial_existing_topics_table.copy()
1361
  acc_reference_df = initial_existing_reference_df.copy()
1362
  acc_topic_summary_df = initial_existing_topic_summary_df.copy()
1363
+ acc_reference_df_pivot = pd.DataFrame()
1364
 
1365
  # Lists are extended
1366
  acc_out_file_paths = []
 
1376
 
1377
  wrapper_first_loop = initial_first_loop_state
1378
 
1379
+ for i, group_value in tqdm(enumerate(unique_values), desc=f"Analysing by group", total=len(unique_values), unit="groups"):
1380
  print(f"\nProcessing segment: {selected_col} = {group_value} ({i+1}/{len(unique_values)})")
1381
 
1382
  filtered_file_data = file_data.copy()
 
1423
  seg_gradio_df,
1424
  _seg_out_files5, # Often same as 1
1425
  seg_join_files,
1426
+ seg_reference_df_pivot
1427
  ) = extract_topics(
1428
  in_data_file=in_data_file,
1429
  file_data=filtered_file_data,
 
1472
  # Aggregate results
1473
  # The DFs returned by extract_topics are already cumulative for *its own run*.
1474
  # We now make them cumulative for the *wrapper's run*.
1475
+ acc_reference_df = pd.concat([acc_reference_df, seg_reference_df])
1476
+ acc_topic_summary_df = pd.concat([acc_topic_summary_df, seg_topic_summary_df])
1477
+ acc_reference_df_pivot = pd.concat([acc_reference_df_pivot, seg_reference_df_pivot])
1478
 
1479
  # For lists, extend. Use set to remove duplicates if paths might be re-added.
1480
  acc_out_file_paths.extend(f for f in seg_out_files1 if f not in acc_out_file_paths)
 
1496
  # Optionally, decide if you want to continue with other segments or stop
1497
  # For now, it will continue
1498
  continue
1499
+
1500
+ if "Group" in acc_reference_df.columns:
1501
+ model_choice_clean = model_name_map[model_choice]
1502
+ model_choice_clean_short = clean_column_name(model_choice_clean, max_length=20, front_characters=False)
1503
+ overall_file_name = f"{clean_column_name(original_file_name, max_length=30)}_"
1504
+
1505
+ acc_reference_df_path = output_folder + overall_file_name + "all_reference_table_" + model_choice_clean_short + "_temp_" + str(temperature) + ".csv"
1506
+ acc_topic_summary_df_path = output_folder + overall_file_name + "all_unique_topics_" + model_choice_clean_short + "_temp_" + str(temperature) + ".csv"
1507
+ acc_reference_df_pivot_path = output_folder + overall_file_name + "all_reference_pivot_" + model_choice_clean_short + "_temp_" + str(temperature) + ".csv"
1508
+
1509
+ acc_reference_df.to_csv(acc_reference_df_path, index=None)
1510
+ acc_topic_summary_df.to_csv(acc_topic_summary_df_path, index=None)
1511
+ acc_reference_df_pivot.to_csv(acc_reference_df_pivot_path, index=None)
1512
+
1513
+ # Remove the existing output file list and replace with the updated concatenated outputs
1514
+ substring_list_to_remove = ["_final_reference_table_pivot_", "_final_reference_table_", "_final_unique_topics_"]
1515
+ acc_out_file_paths = [
1516
+ x for x in acc_out_file_paths
1517
+ if not any(sub in x for sub in substring_list_to_remove)
1518
+ ]
1519
+
1520
+ acc_out_file_paths.extend([acc_reference_df_path, acc_topic_summary_df_path])
1521
+
1522
+ # Outputs for markdown table output
1523
+ unique_table_df_display_table = acc_topic_summary_df.apply(lambda col: col.map(lambda x: wrap_text(x, max_text_length=500)))
1524
+ acc_markdown_output = unique_table_df_display_table[["General Topic", "Subtopic", "Sentiment", "Number of responses", "Summary", "Group"]].to_markdown(index=False)
1525
 
1526
  print(f"\nWrapper finished processing all segments. Total time: {acc_total_time_taken:.2f}s")
1527
 
tools/prompts.py CHANGED
@@ -85,7 +85,7 @@ Your task is to summarise the above table in markdown format. {summary_format}.
85
 
86
  Summary:"""
87
 
88
- comprehensive_summary_format_prompt = "Return a comprehensive summary that covers all the important topics and themes described in the table"
+ comprehensive_summary_format_prompt = "Return a comprehensive summary that covers all the important topics and themes described in the table. If there are different values in the Group column of the data, compare and contrast differences between the topics and themes from each Group."
89
 
90
 
91
  ### Verify exisiting categories prompt