Commit 92003de by seanpedrickcase
1 Parent(s): f2d85f1

Further improved grouping implementation, improved summarisation prompts
app.py CHANGED
@@ -347,13 +347,13 @@ with app:
     summarise_previous_data_btn.click(empty_output_vars_summarise, inputs=None, outputs=[summary_reference_table_sample_state, master_unique_topics_df_revised_summaries_state, master_reference_df_revised_summaries_state, summary_output_files, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, overall_summarisation_input_files]).\
     success(load_in_previous_data_files, inputs=[summarisation_input_files], outputs=[master_reference_df_state, master_unique_topics_df_state, latest_batch_completed_no_loop, deduplication_input_files_status, reference_data_file_name_textbox, unique_topics_table_file_name_textbox]).\
     success(sample_reference_table_summaries, inputs=[master_reference_df_state, random_seed], outputs=[summary_reference_table_sample_state, summarised_references_markdown], api_name="sample_summaries").\
-    success(summarise_output_topics, inputs=[summary_reference_table_sample_state, master_unique_topics_df_state, master_reference_df_state, model_choice, in_api_key, temperature_slide, reference_data_file_name_textbox, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, in_data_files, in_excel_sheets, in_colnames, log_files_output_list_state, summarise_format_radio, output_folder_state], outputs=[summary_reference_table_sample_state, master_unique_topics_df_revised_summaries_state, master_reference_df_revised_summaries_state, summary_output_files, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, summarised_output_markdown, log_files_output, overall_summarisation_input_files], api_name="summarise_topics")
+    success(summarise_output_topics, inputs=[summary_reference_table_sample_state, master_unique_topics_df_state, master_reference_df_state, model_choice, in_api_key, temperature_slide, reference_data_file_name_textbox, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, in_data_files, in_excel_sheets, in_colnames, log_files_output_list_state, summarise_format_radio, output_folder_state, context_textbox], outputs=[summary_reference_table_sample_state, master_unique_topics_df_revised_summaries_state, master_reference_df_revised_summaries_state, summary_output_files, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, summarised_output_markdown, log_files_output, overall_summarisation_input_files], api_name="summarise_topics")
 
-    latest_summary_completed_num.change(summarise_output_topics, inputs=[summary_reference_table_sample_state, master_unique_topics_df_state, master_reference_df_state, model_choice, in_api_key, temperature_slide, reference_data_file_name_textbox, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, in_data_files, in_excel_sheets, in_colnames, log_files_output_list_state, summarise_format_radio, output_folder_state], outputs=[summary_reference_table_sample_state, master_unique_topics_df_revised_summaries_state, master_reference_df_revised_summaries_state, summary_output_files, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, summarised_output_markdown, log_files_output, overall_summarisation_input_files], scroll_to_output=True)
+    latest_summary_completed_num.change(summarise_output_topics, inputs=[summary_reference_table_sample_state, master_unique_topics_df_state, master_reference_df_state, model_choice, in_api_key, temperature_slide, reference_data_file_name_textbox, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, in_data_files, in_excel_sheets, in_colnames, log_files_output_list_state, summarise_format_radio, output_folder_state, context_textbox], outputs=[summary_reference_table_sample_state, master_unique_topics_df_revised_summaries_state, master_reference_df_revised_summaries_state, summary_output_files, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, summarised_output_markdown, log_files_output, overall_summarisation_input_files], scroll_to_output=True)
 
     # SUMMARISE WHOLE TABLE PAGE
     overall_summarise_previous_data_btn.click(load_in_previous_data_files, inputs=[overall_summarisation_input_files], outputs=[master_reference_df_state, master_unique_topics_df_state, latest_batch_completed_no_loop, deduplication_input_files_status, reference_data_file_name_textbox, unique_topics_table_file_name_textbox]).\
-    success(overall_summary, inputs=[master_unique_topics_df_state, model_choice, in_api_key, temperature_slide, unique_topics_table_file_name_textbox, summarised_outputs_list, output_folder_state], outputs=[overall_summary_output_files, overall_summarised_output_markdown], scroll_to_output=True, api_name="overall_summary")
+    success(overall_summary, inputs=[master_unique_topics_df_state, model_choice, in_api_key, temperature_slide, unique_topics_table_file_name_textbox, output_folder_state, in_colnames, context_textbox], outputs=[overall_summary_output_files, overall_summarised_output_markdown], scroll_to_output=True, api_name="overall_summary")
 
     ###
     # CONTINUE PREVIOUS TOPIC EXTRACTION PAGE
@@ -361,8 +361,8 @@ with app:
 
     # If uploaded partially completed consultation files do this. This should then start up the 'latest_batch_completed' change action above to continue extracting topics.
     continue_previous_data_files_btn.click(
-        load_in_data_file, inputs = [in_data_files, in_colnames, batch_size_number, in_excel_sheets], outputs = [file_data_state, reference_data_file_name_textbox, total_number_of_batches]).\
-        success(load_in_previous_data_files, inputs=[in_previous_data_files], outputs=[master_reference_df_state, master_unique_topics_df_state, latest_batch_completed, in_previous_data_files_status, reference_data_file_name_textbox])
+        load_in_data_file, inputs = [in_data_files, in_colnames, batch_size_number, in_excel_sheets], outputs = [file_data_state, reference_data_file_name_textbox, total_number_of_batches]).\
+        success(load_in_previous_data_files, inputs=[in_previous_data_files], outputs=[master_reference_df_state, master_unique_topics_df_state, latest_batch_completed, in_previous_data_files_status, reference_data_file_name_textbox])
 
     ###
     # VERIFY TEXT TITLES/DESCRIPTIONS
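Note on the change above: the new context_textbox value is simply threaded through Gradio's chained event pattern so the summarisation handlers can fill in their system prompts. A minimal sketch of that pattern, with hypothetical component names rather than the app's real ones:

```python
import gradio as gr

def summarise(context: str, table_name: str) -> str:
    # Placeholder for the real summarisation call; returns a short status string.
    return f"Summarised '{table_name}' with context: {context}"

with gr.Blocks() as demo:
    context_textbox = gr.Textbox(label="Consultation context")  # assumed analogue of the app's context_textbox
    table_name = gr.Textbox(label="Table file name")
    run_btn = gr.Button("Summarise")
    status = gr.Markdown()

    # .click() starts the chain; .success() only runs if the previous step completed without error.
    run_btn.click(lambda: "Loading...", inputs=None, outputs=status).\
        success(summarise, inputs=[context_textbox, table_name], outputs=status)

demo.launch()
```

The commit only appends context_textbox (and, for overall_summary, in_colnames) to the existing inputs lists; the chain structure itself is unchanged.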
tools/aws_functions.py CHANGED
@@ -3,7 +3,7 @@ import pandas as pd
 import boto3
 import tempfile
 import os
-from tools.config import RUN_AWS_FUNCTIONS, AWS_REGION, CONSULTATION_SUMMARY_BUCKET
+from tools.config import RUN_AWS_FUNCTIONS, AWS_REGION, S3_LOG_BUCKET
 
 PandasDataFrame = Type[pd.DataFrame]
 
@@ -12,7 +12,7 @@ bucket_name=""
 
 if RUN_AWS_FUNCTIONS == "1":
     try:
-        bucket_name = CONSULTATION_SUMMARY_BUCKET
+        bucket_name = S3_LOG_BUCKET
         session = boto3.Session() # profile_name="default"
     except Exception as e:
         print(e)
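The renamed S3_LOG_BUCKET is only picked up here as the default bucket_name. A minimal sketch of how a bucket name like this is typically used with boto3 to push a log file (the helper below is illustrative, not a function from this module):

```python
import boto3

from tools.config import RUN_AWS_FUNCTIONS, S3_LOG_BUCKET

def upload_log_to_s3(local_path: str, key: str) -> None:
    """Illustrative helper: copy a local log file into the configured log bucket."""
    if RUN_AWS_FUNCTIONS != "1" or not S3_LOG_BUCKET:
        print("AWS functions disabled or no bucket configured; skipping upload")
        return
    session = boto3.Session()   # relies on the default AWS credential chain
    s3 = session.client("s3")
    s3.upload_file(local_path, S3_LOG_BUCKET, key)

# upload_log_to_s3("output/usage_log.csv", "logs/usage_log.csv")
```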
tools/config.py CHANGED
@@ -105,7 +105,7 @@ if AWS_ACCESS_KEY: print(f'AWS_ACCESS_KEY found in environment variables')
 AWS_SECRET_KEY = get_or_create_env_var('AWS_SECRET_KEY', '')
 if AWS_SECRET_KEY: print(f'AWS_SECRET_KEY found in environment variables')
 
-CONSULTATION_SUMMARY_BUCKET = get_or_create_env_var('CONSULTATION_SUMMARY_BUCKET', '')
+S3_LOG_BUCKET = get_or_create_env_var('S3_LOG_BUCKET', '')
 
 # Custom headers e.g. if routing traffic through Cloudfront
 # Retrieving or setting CUSTOM_HEADER
@@ -320,7 +320,7 @@ GET_DEFAULT_ALLOW_LIST = get_or_create_env_var('GET_DEFAULT_ALLOW_LIST', '')
 
 ALLOW_LIST_PATH = get_or_create_env_var('ALLOW_LIST_PATH', '') # config/default_allow_list.csv
 
-S3_ALLOW_LIST_PATH = get_or_create_env_var('S3_ALLOW_LIST_PATH', '') # default_allow_list.csv # This is a path within the DOCUMENT_REDACTION_BUCKET
+S3_ALLOW_LIST_PATH = get_or_create_env_var('S3_ALLOW_LIST_PATH', '') # default_allow_list.csv # This is a path within the named S3 bucket
 
 if ALLOW_LIST_PATH: OUTPUT_ALLOW_LIST_PATH = ALLOW_LIST_PATH
 else: OUTPUT_ALLOW_LIST_PATH = 'config/default_allow_list.csv'
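get_or_create_env_var is defined elsewhere in config.py and is not part of this diff; a plausible minimal implementation, assuming it simply reads the variable and seeds a default when missing, would look like:

```python
import os

def get_or_create_env_var(var_name: str, default_value: str) -> str:
    # Assumed behaviour: return the environment variable if set,
    # otherwise register the default and return it.
    value = os.environ.get(var_name)
    if value is None:
        os.environ[var_name] = default_value
        value = default_value
    return value

S3_LOG_BUCKET = get_or_create_env_var('S3_LOG_BUCKET', '')  # as used above
```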
tools/dedup_summaries.py CHANGED
@@ -7,7 +7,7 @@ import gradio as gr
 import time
 from tqdm import tqdm
 
-from tools.prompts import summarise_topic_descriptions_prompt, summarise_topic_descriptions_system_prompt, system_prompt, summarise_everything_prompt, comprehensive_summary_format_prompt
+from tools.prompts import summarise_topic_descriptions_prompt, summarise_topic_descriptions_system_prompt, system_prompt, summarise_everything_prompt, comprehensive_summary_format_prompt, summarise_everything_system_prompt, comprehensive_summary_format_prompt_by_group
 from tools.llm_funcs import construct_gemini_generative_model, process_requests, ResponseObject, load_model
 from tools.helper_functions import create_topic_summary_df_from_reference_table, load_in_data_file, get_basic_response_data, convert_reference_table_to_pivot_table, wrap_text, clean_column_name
 from tools.config import OUTPUT_FOLDER, RUN_LOCAL_MODEL, MAX_COMMENT_CHARS, MAX_TOKENS, TIMEOUT_WAIT, NUMBER_OF_RETRY_ATTEMPTS, MAX_TIME_FOR_LOOP, BATCH_SIZE_DEFAULT, DEDUPLICATION_THRESHOLD, model_name_map, CHOSEN_LOCAL_MODEL_TYPE, LOCAL_REPO_ID, LOCAL_MODEL_FILE, LOCAL_MODEL_FOLDER, LLM_SEED
@@ -437,7 +437,8 @@ def summarise_output_topics(summarised_references:pd.DataFrame,
     chosen_cols:List[str]=[],
     log_output_files:list[str]=[],
     summarise_format_radio:str="Return a summary up to two paragraphs long that includes as much detail as possible from the original text",
-    output_folder:str=OUTPUT_FOLDER,
+    output_folder:str=OUTPUT_FOLDER,
+    context_textbox:str="",
     summarise_topic_descriptions_prompt:str=summarise_topic_descriptions_prompt, summarise_topic_descriptions_system_prompt:str=summarise_topic_descriptions_system_prompt,
     do_summaries:str="Yes",
     progress=gr.Progress(track_tqdm=True)):
@@ -566,8 +567,10 @@ def summarise_output_topics(summarised_references:pd.DataFrame,
         #print("summary_text:", summary_text)
         formatted_summary_prompt = [summarise_topic_descriptions_prompt.format(summaries=summary_text, summary_format=summarise_format_radio)]
 
+        formatted_summarise_topic_descriptions_system_prompt = summarise_topic_descriptions_system_prompt.format(column_name=chosen_cols[0],consultation_context=context_textbox)
+
         try:
-            response, conversation_history, metadata = summarise_output_topics_query(model_choice, in_api_key, temperature, formatted_summary_prompt, summarise_topic_descriptions_system_prompt, local_model)
+            response, conversation_history, metadata = summarise_output_topics_query(model_choice, in_api_key, temperature, formatted_summary_prompt, formatted_summarise_topic_descriptions_system_prompt, local_model)
             summarised_output = response
             summarised_output = re.sub(r'\n{2,}', '\n', summarised_output) # Replace multiple line breaks with a single line break
             summarised_output = re.sub(r'^\n{1,}', '', summarised_output) # Remove one or more line breaks at the start
@@ -584,7 +587,7 @@ def summarise_output_topics(summarised_references:pd.DataFrame,
 
         # Check if beyond max time allowed for processing and break if necessary
         toc = time.perf_counter()
-        time_taken = tic - toc
+        time_taken = toc - tic
 
         if time_taken > max_time_for_loop:
             print("Time taken for loop is greater than maximum time allowed. Exiting and restarting loop")
@@ -594,7 +597,7 @@ def summarise_output_topics(summarised_references:pd.DataFrame,
 
     # If all summaries completeed
     if latest_summary_completed >= length_all_summaries:
-        print("At last summary.")
+        print("At last summary. Time taken:", time_taken)
 
         output_files = list(set(output_files))
 
@@ -605,10 +608,13 @@ def overall_summary(topic_summary_df:pd.DataFrame,
     in_api_key:str,
     temperature:float,
     table_file_name:str,
-    summarised_outputs:list = [],
-    output_folder:str=OUTPUT_FOLDER,
+    output_folder:str=OUTPUT_FOLDER,
+    chosen_cols:List[str]=[],
+    context_textbox:str="",
     summarise_everything_prompt:str=summarise_everything_prompt,
     comprehensive_summary_format_prompt:str=comprehensive_summary_format_prompt,
+    comprehensive_summary_format_prompt_by_group:str=comprehensive_summary_format_prompt_by_group,
+    summarise_everything_system_prompt:str=summarise_everything_system_prompt,
     do_summaries:str="Yes",
     progress=gr.Progress(track_tqdm=True)):
     '''
@@ -616,17 +622,35 @@ def overall_summary(topic_summary_df:pd.DataFrame,
     '''
 
     out_metadata = []
-    local_model = []
-    length_all_summaries = 1
+    local_model = []
     latest_summary_completed = 0
     output_files = []
+    txt_summarised_outputs = []
+    summarised_outputs = []
+
+    if "Group" not in topic_summary_df.columns:
+        topic_summary_df["Group"] = "All"
+
+    topic_summary_df = topic_summary_df.sort_values(by=["Group", "Number of responses"], ascending=[True, False])
+
+    unique_groups = sorted(topic_summary_df["Group"].unique())
+
+    print("unique_groups:", unique_groups)
+
+    length_groups = len(unique_groups)
+
+    if length_groups > 1:
+        comprehensive_summary_format_prompt = comprehensive_summary_format_prompt_by_group
+    else:
+        comprehensive_summary_format_prompt = comprehensive_summary_format_prompt
 
     model_choice_clean = model_name_map[model_choice]
     model_choice_clean_short = clean_column_name(model_choice_clean, max_length=20, front_characters=False)
     file_name = re.search(r'(.*?)(?:_all_|_final_|_batch_|_col_)', table_file_name).group(1) if re.search(r'(.*?)(?:_all_|_final_|_batch_|_col_)', table_file_name) else table_file_name
     latest_batch_completed = int(re.search(r'batch_(\d+)_', table_file_name).group(1)) if 'batch_' in table_file_name else ""
     batch_size_number = int(re.search(r'size_(\d+)_', table_file_name).group(1)) if 'size_' in table_file_name else ""
-    in_column_cleaned = re.search(r'col_(.*?)_reference', table_file_name).group(1) if 'col_' in table_file_name else ""
+    in_column_cleaned = re.search(r'col_(.*?)_unique', table_file_name).group(1) if 'col_' in table_file_name else ""
 
     # Save outputs for each batch. If master file created, label file as master
     if latest_batch_completed:
@@ -644,48 +668,75 @@ def overall_summary(topic_summary_df:pd.DataFrame,
         local_model, tokenizer = load_model(local_model_type=CHOSEN_LOCAL_MODEL_TYPE, repo_id=LOCAL_REPO_ID, model_filename=LOCAL_MODEL_FILE, model_dir=LOCAL_MODEL_FOLDER)
         #print("Local model loaded:", local_model)
 
-    summary_loop_description = "Creating summaries. " + str(latest_summary_completed) + " summaries completed so far."
-    summary_loop = tqdm(range(latest_summary_completed, length_all_summaries), desc="Creating summaries", unit="summaries")
+    summary_loop = tqdm(unique_groups, desc="Creating summaries for groups", unit="groups")
 
     if do_summaries == "Yes":
-        for summary_no in summary_loop:
+        for summary_group in summary_loop:
 
-            summary_text = topic_summary_df.to_markdown(index=False)
+            print("Creating summary for group:", summary_group)
+
+            summary_text = topic_summary_df.loc[topic_summary_df["Group"]==summary_group].to_markdown(index=False)
 
             formatted_summary_prompt = [summarise_everything_prompt.format(topic_summary_table=summary_text, summary_format=comprehensive_summary_format_prompt)]
 
+            formatted_summarise_everything_system_prompt = summarise_everything_system_prompt.format(column_name=chosen_cols[0],consultation_context=context_textbox)
+
             try:
-                response, conversation_history, metadata = summarise_output_topics_query(model_choice, in_api_key, temperature, formatted_summary_prompt, summarise_topic_descriptions_system_prompt, local_model)
+                response, conversation_history, metadata = summarise_output_topics_query(model_choice, in_api_key, temperature, formatted_summary_prompt, formatted_summarise_everything_system_prompt, local_model)
                 summarised_output = response
                 summarised_output = re.sub(r'\n{2,}', '\n', summarised_output) # Replace multiple line breaks with a single line break
                 summarised_output = re.sub(r'^\n{1,}', '', summarised_output) # Remove one or more line breaks at the start
                 summarised_output = summarised_output.strip()
            except Exception as e:
-                print(e)
+                print("Cannot create overall summary for group:", summary_group, "due to:", e)
                 summarised_output = ""
 
             summarised_outputs.append(summarised_output)
+            txt_summarised_outputs.append(f"""Group name: {summary_group}\n""" + summarised_output)
+
             out_metadata.extend(metadata)
             out_metadata_str = '. '.join(out_metadata)
 
             latest_summary_completed += 1
 
-            # Check if beyond max time allowed for processing and break if necessary
-            toc = time.perf_counter()
-            time_taken = tic - toc
+            summary_group_short = clean_column_name(summary_group)
 
-            # Define the output file path for the output
-            print("batch_file_path_details just before save:", batch_file_path_details)
-            overall_summary_output_path = output_folder + batch_file_path_details + "_overall_summary_" + model_choice_clean_short + "_temp_" + str(temperature) + ".txt"
+            # Write outputs
+            overall_summary_output_path = output_folder + batch_file_path_details + "_overall_summary_grp" + summary_group_short + "_" + model_choice_clean_short + "_temp_" + str(temperature) + ".txt"
 
-            # Write the formatted prompt to the specified file
+            # Write single group outputs
             try:
                 with open(overall_summary_output_path, "w", encoding='utf-8', errors='replace') as f:
                     f.write(summarised_output)
-                output_files.append(overall_summary_output_path)
+                # output_files.append(overall_summary_output_path)
            except Exception as e:
                 print(f"Error writing prompt to file {overall_summary_output_path}: {e}")
 
-            output_files = list(set(output_files))
+        # Write overall outputs to csv
+        overall_summary_output_csv_path = output_folder + batch_file_path_details + "_overall_summary_" + model_choice_clean_short + "_temp_" + str(temperature) + ".csv"
+        summarised_outputs_df = pd.DataFrame(data={"Group":unique_groups, "Summary":summarised_outputs})
+        summarised_outputs_df.to_csv(overall_summary_output_csv_path, index=None)
+        output_files.append(overall_summary_output_csv_path)
+
+        markdown_output_table = summarised_outputs_df.to_markdown(index=False)
+
+        # Text output file
+        summarised_outputs_join = "\n".join(txt_summarised_outputs)
+        overall_summary_output_txt_path = output_folder + batch_file_path_details + "_overall_summary_" + model_choice_clean_short + "_temp_" + str(temperature) + ".txt"
+
+        try:
+            with open(overall_summary_output_txt_path, "w", encoding='utf-8', errors='replace') as f:
+                f.write(summarised_outputs_join)
+            output_files.append(overall_summary_output_txt_path)
+        except Exception as e:
+            print(f"Error writing prompt to file {overall_summary_output_txt_path}: {e}")
+
+        output_files = list(set(output_files))
+
+        # Check if beyond max time allowed for processing and break if necessary
+        toc = time.perf_counter()
+        time_taken = toc - tic
+
+        print("All group summaries created. Time taken:", time_taken)
 
-    return output_files, summarised_output
+    return output_files, markdown_output_table
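The heart of the new grouping implementation above: add a default 'Group' column when none exists, loop over the unique groups, summarise each group's slice of the topic table separately, then collect the per-group summaries into one CSV plus a markdown table for display. A stripped-down sketch of that pattern, with a stand-in summarise() in place of the real LLM call:

```python
import pandas as pd

def summarise(table_markdown: str) -> str:
    # Stand-in for the real LLM call (summarise_output_topics_query in the app).
    return f"Summary of a table with {table_markdown.count(chr(10)) + 1} lines"

def overall_summary_by_group(topic_summary_df: pd.DataFrame) -> tuple[pd.DataFrame, str]:
    if "Group" not in topic_summary_df.columns:
        topic_summary_df["Group"] = "All"  # single implicit group when none supplied

    topic_summary_df = topic_summary_df.sort_values(
        by=["Group", "Number of responses"], ascending=[True, False])

    summaries = []
    for group in sorted(topic_summary_df["Group"].unique()):
        # to_markdown needs the 'tabulate' package installed
        group_table = topic_summary_df.loc[topic_summary_df["Group"] == group].to_markdown(index=False)
        summaries.append({"Group": group, "Summary": summarise(group_table)})

    summaries_df = pd.DataFrame(summaries)
    return summaries_df, summaries_df.to_markdown(index=False)

# Example:
# df = pd.DataFrame({"Group": ["A", "A", "B"], "Subtopic": ["x", "y", "z"], "Number of responses": [5, 3, 7]})
# summaries_df, markdown_table = overall_summary_by_group(df)
```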
tools/llm_api_call.py CHANGED
@@ -186,7 +186,7 @@ def data_file_to_markdown_table(file_data:pd.DataFrame, file_name:str, chosen_co
 
     return simplified_csv_table_path, normalised_simple_markdown_table, start_row, end_row, batch_basic_response_data
 
-def replace_punctuation_with_underscore(input_string):
+def replace_punctuation_with_underscore(input_string:str):
     # Create a translation table where each punctuation character maps to '_'
     translation_table = str.maketrans(string.punctuation, '_' * len(string.punctuation))
 
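The body of replace_punctuation_with_underscore is truncated by the hunk above; presumably it applies the translation table along these lines (a sketch, not the file's exact code):

```python
import string

def replace_punctuation_with_underscore(input_string: str) -> str:
    # Map every punctuation character to '_' and apply the mapping to the input.
    translation_table = str.maketrans(string.punctuation, '_' * len(string.punctuation))
    return input_string.translate(translation_table)

print(replace_punctuation_with_underscore("Q1: What do you think?"))  # Q1_ What do you think_
```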
tools/prompts.py CHANGED
@@ -1,4 +1,6 @@
-system_prompt = """You are a researcher analysing responses from an open text dataset. You are analysing a single column from this dataset that is full of open text responses called '{column_name}'. The context of this analysis is '{consultation_context}'."""
+generic_system_prompt = """You are a researcher analysing responses from an open text dataset. You are analysing a single column from this dataset."""
+
+system_prompt = """You are a researcher analysing responses from an open text dataset. You are analysing a single column from this dataset called '{column_name}'. The context of this analysis is '{consultation_context}'."""
 
 initial_table_prompt = """The open text data is shown in the following table that contains two columns, Reference and Response. Response table:
 {response_table}
@@ -48,6 +50,10 @@ New table:"""
 
 # Return only one table in markdown format containing all relevant topics. Remove topics from the table that are not assigned to any response. Do not repeat Subtopics with the same Sentiment.
 
+###
+# SUMMARISE TOPICS PROMPT
+###
+
 summarise_topic_descriptions_system_prompt = system_prompt
 
 summarise_topic_descriptions_prompt = """Below is a table with number of paragraphs related to the data from the open text column:
@@ -62,36 +68,33 @@ single_para_summary_format_prompt = "Return a concise summary up to one paragrap
 
 two_para_summary_format_prompt = "Return a summary up to two paragraphs long that includes as much detail as possible from the original text"
 
-## The following didn't work well in testing and so is not currently used
-
-create_general_topics_system_prompt = system_prompt
-
-create_general_topics_prompt = """Subtopics known to be relevant to this dataset are shown in the following Topics table:
-{topics}
-
-Your task is to create a General Topic name for each Subtopic. The new Topics table should have the columns 'General Topic' and 'Subtopic' only. Write a 'General Topic' text label relevant to the Subtopic next to it in the new table. The text label should describe the general theme of the Subtopic. Do not add any other text, thoughts, or notes to your response.
-
-New Topics table:"""
-
-### Summarise everything prompt
+###
+# OVERALL SUMMARY PROMPTS
+###
+
+summarise_everything_system_prompt = generic_system_prompt
 
 summarise_everything_prompt = """Below is a table that gives an overview of the main topics from a dataset of open text responses along with a description of each topic, and the number of responses that mentioned each topic:
 
 '{topic_summary_table}'
 
-Your task is to summarise the above table in markdown format. {summary_format}. Return only the summary in markdown format and no other text.
+Your task is to summarise the above table. {summary_format}. Return only the summary and no other text.
 
 Summary:"""
 
-comprehensive_summary_format_prompt = "Return a comprehensive summary that covers all the important topics and themes described in the table. If there are different values in the Group column of the data, compare and contrast differences between the topics and themes from each Group."
-
-
-### Verify exisiting categories prompt
+comprehensive_summary_format_prompt = "Return a comprehensive summary that covers all the important topics and themes described in the table. Structure the summary with General Topics as headings, with significant Subtopics described in bullet points below them in order of relative significance. Do not explicitly mention the Sentiment, Number of responses, or Group values. Do not use the words 'General Topic' or 'Subtopic' directly in the summary."
+
+comprehensive_summary_format_prompt_by_group = "Return a comprehensive summary that covers all the important topics and themes described in the table. Structure the summary with General Topics as headings, with significant Subtopics described in bullet points below them in order of relative significance. Do not explicitly mention the Sentiment, Number of responses, or Group values. Do not use the words 'General Topic' or 'Subtopic' directly in the summary. Compare and contrast differences between the topics and themes from each Group."
+
+###
+# VERIFY EXISTING DESCRIPTIONS/TITLES
+###
+
 verify_titles_system_prompt = system_prompt
 
-
 verify_titles_prompt = """Response numbers alongside the Response text and assigned descriptions are shown in the table below:
 {response_table}
 
@@ -107,6 +110,16 @@ Do not add any other text to your response.
 Output markdown table:"""
 
 
+## The following didn't work well in testing and so is not currently used
+
+create_general_topics_system_prompt = system_prompt
+
+create_general_topics_prompt = """Subtopics known to be relevant to this dataset are shown in the following Topics table:
+{topics}
+
+Your task is to create a General Topic name for each Subtopic. The new Topics table should have the columns 'General Topic' and 'Subtopic' only. Write a 'General Topic' text label relevant to the Subtopic next to it in the new table. The text label should describe the general theme of the Subtopic. Do not add any other text, thoughts, or notes to your response.
+
+New Topics table:"""
 
 # example_instruction_prompt_llama3 = """<|start_header_id|>system<|end_header_id|>\n
 # You are an AI assistant that follows instruction extremely well. Help as much as you can.<|eot_id|><|start_header_id|>user<|end_header_id|>\n
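As used in tools/dedup_summaries.py above, the column-specific system prompt is filled in at call time, while the new generic_system_prompt needs no placeholders. A short illustration of the formatting step (the example values are made up):

```python
system_prompt = """You are a researcher analysing responses from an open text dataset. You are analysing a single column from this dataset called '{column_name}'. The context of this analysis is '{consultation_context}'."""

# Per-topic summaries format the chosen column name and consultation context into the system prompt:
formatted_system_prompt = system_prompt.format(
    column_name="What do you think of the proposal?",        # e.g. chosen_cols[0]
    consultation_context="2024 parking policy consultation",  # e.g. context_textbox
)
print(formatted_system_prompt)
```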