Spaces:
Running
on
Zero
Running
on
Zero
Commit
·
92003de
1
Parent(s):
f2d85f1
Further improved grouping implementation, improved summarisation prompts
Browse files- app.py +5 -5
- tools/aws_functions.py +2 -2
- tools/config.py +2 -2
- tools/dedup_summaries.py +77 -26
- tools/llm_api_call.py +1 -1
- tools/prompts.py +30 -17
app.py
CHANGED
|
@@ -347,13 +347,13 @@ with app:
|
|
| 347 |
summarise_previous_data_btn.click(empty_output_vars_summarise, inputs=None, outputs=[summary_reference_table_sample_state, master_unique_topics_df_revised_summaries_state, master_reference_df_revised_summaries_state, summary_output_files, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, overall_summarisation_input_files]).\
|
| 348 |
success(load_in_previous_data_files, inputs=[summarisation_input_files], outputs=[master_reference_df_state, master_unique_topics_df_state, latest_batch_completed_no_loop, deduplication_input_files_status, reference_data_file_name_textbox, unique_topics_table_file_name_textbox]).\
|
| 349 |
success(sample_reference_table_summaries, inputs=[master_reference_df_state, random_seed], outputs=[summary_reference_table_sample_state, summarised_references_markdown], api_name="sample_summaries").\
|
| 350 |
-
success(summarise_output_topics, inputs=[summary_reference_table_sample_state, master_unique_topics_df_state, master_reference_df_state, model_choice, in_api_key, temperature_slide, reference_data_file_name_textbox, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, in_data_files, in_excel_sheets, in_colnames, log_files_output_list_state, summarise_format_radio, output_folder_state], outputs=[summary_reference_table_sample_state, master_unique_topics_df_revised_summaries_state, master_reference_df_revised_summaries_state, summary_output_files, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, summarised_output_markdown, log_files_output, overall_summarisation_input_files], api_name="summarise_topics")
|
| 351 |
|
| 352 |
-
latest_summary_completed_num.change(summarise_output_topics, inputs=[summary_reference_table_sample_state, master_unique_topics_df_state, master_reference_df_state, model_choice, in_api_key, temperature_slide, reference_data_file_name_textbox, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, in_data_files, in_excel_sheets, in_colnames, log_files_output_list_state, summarise_format_radio, output_folder_state], outputs=[summary_reference_table_sample_state, master_unique_topics_df_revised_summaries_state, master_reference_df_revised_summaries_state, summary_output_files, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, summarised_output_markdown, log_files_output, overall_summarisation_input_files], scroll_to_output=True)
|
| 353 |
|
| 354 |
# SUMMARISE WHOLE TABLE PAGE
|
| 355 |
overall_summarise_previous_data_btn.click(load_in_previous_data_files, inputs=[overall_summarisation_input_files], outputs=[master_reference_df_state, master_unique_topics_df_state, latest_batch_completed_no_loop, deduplication_input_files_status, reference_data_file_name_textbox, unique_topics_table_file_name_textbox]).\
|
| 356 |
-
success(overall_summary, inputs=[master_unique_topics_df_state, model_choice, in_api_key, temperature_slide, unique_topics_table_file_name_textbox,
|
| 357 |
|
| 358 |
###
|
| 359 |
# CONTINUE PREVIOUS TOPIC EXTRACTION PAGE
|
|
@@ -361,8 +361,8 @@ with app:
|
|
| 361 |
|
| 362 |
# If uploaded partially completed consultation files do this. This should then start up the 'latest_batch_completed' change action above to continue extracting topics.
|
| 363 |
continue_previous_data_files_btn.click(
|
| 364 |
-
|
| 365 |
-
|
| 366 |
|
| 367 |
###
|
| 368 |
# VERIFY TEXT TITLES/DESCRIPTIONS
|
|
|
|
| 347 |
summarise_previous_data_btn.click(empty_output_vars_summarise, inputs=None, outputs=[summary_reference_table_sample_state, master_unique_topics_df_revised_summaries_state, master_reference_df_revised_summaries_state, summary_output_files, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, overall_summarisation_input_files]).\
|
| 348 |
success(load_in_previous_data_files, inputs=[summarisation_input_files], outputs=[master_reference_df_state, master_unique_topics_df_state, latest_batch_completed_no_loop, deduplication_input_files_status, reference_data_file_name_textbox, unique_topics_table_file_name_textbox]).\
|
| 349 |
success(sample_reference_table_summaries, inputs=[master_reference_df_state, random_seed], outputs=[summary_reference_table_sample_state, summarised_references_markdown], api_name="sample_summaries").\
|
| 350 |
+
success(summarise_output_topics, inputs=[summary_reference_table_sample_state, master_unique_topics_df_state, master_reference_df_state, model_choice, in_api_key, temperature_slide, reference_data_file_name_textbox, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, in_data_files, in_excel_sheets, in_colnames, log_files_output_list_state, summarise_format_radio, output_folder_state, context_textbox], outputs=[summary_reference_table_sample_state, master_unique_topics_df_revised_summaries_state, master_reference_df_revised_summaries_state, summary_output_files, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, summarised_output_markdown, log_files_output, overall_summarisation_input_files], api_name="summarise_topics")
|
| 351 |
|
| 352 |
+
latest_summary_completed_num.change(summarise_output_topics, inputs=[summary_reference_table_sample_state, master_unique_topics_df_state, master_reference_df_state, model_choice, in_api_key, temperature_slide, reference_data_file_name_textbox, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, in_data_files, in_excel_sheets, in_colnames, log_files_output_list_state, summarise_format_radio, output_folder_state, context_textbox], outputs=[summary_reference_table_sample_state, master_unique_topics_df_revised_summaries_state, master_reference_df_revised_summaries_state, summary_output_files, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, summarised_output_markdown, log_files_output, overall_summarisation_input_files], scroll_to_output=True)
|
| 353 |
|
| 354 |
# SUMMARISE WHOLE TABLE PAGE
|
| 355 |
overall_summarise_previous_data_btn.click(load_in_previous_data_files, inputs=[overall_summarisation_input_files], outputs=[master_reference_df_state, master_unique_topics_df_state, latest_batch_completed_no_loop, deduplication_input_files_status, reference_data_file_name_textbox, unique_topics_table_file_name_textbox]).\
|
| 356 |
+
success(overall_summary, inputs=[master_unique_topics_df_state, model_choice, in_api_key, temperature_slide, unique_topics_table_file_name_textbox, output_folder_state, in_colnames, context_textbox], outputs=[overall_summary_output_files, overall_summarised_output_markdown], scroll_to_output=True, api_name="overall_summary")
|
| 357 |
|
| 358 |
###
|
| 359 |
# CONTINUE PREVIOUS TOPIC EXTRACTION PAGE
|
|
|
|
| 361 |
|
| 362 |
# If uploaded partially completed consultation files do this. This should then start up the 'latest_batch_completed' change action above to continue extracting topics.
|
| 363 |
continue_previous_data_files_btn.click(
|
| 364 |
+
load_in_data_file, inputs = [in_data_files, in_colnames, batch_size_number, in_excel_sheets], outputs = [file_data_state, reference_data_file_name_textbox, total_number_of_batches]).\
|
| 365 |
+
success(load_in_previous_data_files, inputs=[in_previous_data_files], outputs=[master_reference_df_state, master_unique_topics_df_state, latest_batch_completed, in_previous_data_files_status, reference_data_file_name_textbox])
|
| 366 |
|
| 367 |
###
|
| 368 |
# VERIFY TEXT TITLES/DESCRIPTIONS
|
tools/aws_functions.py
CHANGED
|
@@ -3,7 +3,7 @@ import pandas as pd
|
|
| 3 |
import boto3
|
| 4 |
import tempfile
|
| 5 |
import os
|
| 6 |
-
from tools.config import RUN_AWS_FUNCTIONS, AWS_REGION,
|
| 7 |
|
| 8 |
PandasDataFrame = Type[pd.DataFrame]
|
| 9 |
|
|
@@ -12,7 +12,7 @@ bucket_name=""
|
|
| 12 |
|
| 13 |
if RUN_AWS_FUNCTIONS == "1":
|
| 14 |
try:
|
| 15 |
-
bucket_name =
|
| 16 |
session = boto3.Session() # profile_name="default"
|
| 17 |
except Exception as e:
|
| 18 |
print(e)
|
|
|
|
| 3 |
import boto3
|
| 4 |
import tempfile
|
| 5 |
import os
|
| 6 |
+
from tools.config import RUN_AWS_FUNCTIONS, AWS_REGION, S3_LOG_BUCKET
|
| 7 |
|
| 8 |
PandasDataFrame = Type[pd.DataFrame]
|
| 9 |
|
|
|
|
| 12 |
|
| 13 |
if RUN_AWS_FUNCTIONS == "1":
|
| 14 |
try:
|
| 15 |
+
bucket_name = S3_LOG_BUCKET
|
| 16 |
session = boto3.Session() # profile_name="default"
|
| 17 |
except Exception as e:
|
| 18 |
print(e)
|
tools/config.py
CHANGED
|
@@ -105,7 +105,7 @@ if AWS_ACCESS_KEY: print(f'AWS_ACCESS_KEY found in environment variables')
|
|
| 105 |
AWS_SECRET_KEY = get_or_create_env_var('AWS_SECRET_KEY', '')
|
| 106 |
if AWS_SECRET_KEY: print(f'AWS_SECRET_KEY found in environment variables')
|
| 107 |
|
| 108 |
-
|
| 109 |
|
| 110 |
# Custom headers e.g. if routing traffic through Cloudfront
|
| 111 |
# Retrieving or setting CUSTOM_HEADER
|
|
@@ -320,7 +320,7 @@ GET_DEFAULT_ALLOW_LIST = get_or_create_env_var('GET_DEFAULT_ALLOW_LIST', '')
|
|
| 320 |
|
| 321 |
ALLOW_LIST_PATH = get_or_create_env_var('ALLOW_LIST_PATH', '') # config/default_allow_list.csv
|
| 322 |
|
| 323 |
-
S3_ALLOW_LIST_PATH = get_or_create_env_var('S3_ALLOW_LIST_PATH', '') # default_allow_list.csv # This is a path within the
|
| 324 |
|
| 325 |
if ALLOW_LIST_PATH: OUTPUT_ALLOW_LIST_PATH = ALLOW_LIST_PATH
|
| 326 |
else: OUTPUT_ALLOW_LIST_PATH = 'config/default_allow_list.csv'
|
|
|
|
| 105 |
AWS_SECRET_KEY = get_or_create_env_var('AWS_SECRET_KEY', '')
|
| 106 |
if AWS_SECRET_KEY: print(f'AWS_SECRET_KEY found in environment variables')
|
| 107 |
|
| 108 |
+
S3_LOG_BUCKET = get_or_create_env_var('S3_LOG_BUCKET', '')
|
| 109 |
|
| 110 |
# Custom headers e.g. if routing traffic through Cloudfront
|
| 111 |
# Retrieving or setting CUSTOM_HEADER
|
|
|
|
| 320 |
|
| 321 |
ALLOW_LIST_PATH = get_or_create_env_var('ALLOW_LIST_PATH', '') # config/default_allow_list.csv
|
| 322 |
|
| 323 |
+
S3_ALLOW_LIST_PATH = get_or_create_env_var('S3_ALLOW_LIST_PATH', '') # default_allow_list.csv # This is a path within the named S3 bucket
|
| 324 |
|
| 325 |
if ALLOW_LIST_PATH: OUTPUT_ALLOW_LIST_PATH = ALLOW_LIST_PATH
|
| 326 |
else: OUTPUT_ALLOW_LIST_PATH = 'config/default_allow_list.csv'
|
tools/dedup_summaries.py
CHANGED
|
@@ -7,7 +7,7 @@ import gradio as gr
|
|
| 7 |
import time
|
| 8 |
from tqdm import tqdm
|
| 9 |
|
| 10 |
-
from tools.prompts import summarise_topic_descriptions_prompt, summarise_topic_descriptions_system_prompt, system_prompt, summarise_everything_prompt, comprehensive_summary_format_prompt
|
| 11 |
from tools.llm_funcs import construct_gemini_generative_model, process_requests, ResponseObject, load_model
|
| 12 |
from tools.helper_functions import create_topic_summary_df_from_reference_table, load_in_data_file, get_basic_response_data, convert_reference_table_to_pivot_table, wrap_text, clean_column_name
|
| 13 |
from tools.config import OUTPUT_FOLDER, RUN_LOCAL_MODEL, MAX_COMMENT_CHARS, MAX_TOKENS, TIMEOUT_WAIT, NUMBER_OF_RETRY_ATTEMPTS, MAX_TIME_FOR_LOOP, BATCH_SIZE_DEFAULT, DEDUPLICATION_THRESHOLD, model_name_map, CHOSEN_LOCAL_MODEL_TYPE, LOCAL_REPO_ID, LOCAL_MODEL_FILE, LOCAL_MODEL_FOLDER, LLM_SEED
|
|
@@ -437,7 +437,8 @@ def summarise_output_topics(summarised_references:pd.DataFrame,
|
|
| 437 |
chosen_cols:List[str]=[],
|
| 438 |
log_output_files:list[str]=[],
|
| 439 |
summarise_format_radio:str="Return a summary up to two paragraphs long that includes as much detail as possible from the original text",
|
| 440 |
-
output_folder:str=OUTPUT_FOLDER,
|
|
|
|
| 441 |
summarise_topic_descriptions_prompt:str=summarise_topic_descriptions_prompt, summarise_topic_descriptions_system_prompt:str=summarise_topic_descriptions_system_prompt,
|
| 442 |
do_summaries:str="Yes",
|
| 443 |
progress=gr.Progress(track_tqdm=True)):
|
|
@@ -566,8 +567,10 @@ def summarise_output_topics(summarised_references:pd.DataFrame,
|
|
| 566 |
#print("summary_text:", summary_text)
|
| 567 |
formatted_summary_prompt = [summarise_topic_descriptions_prompt.format(summaries=summary_text, summary_format=summarise_format_radio)]
|
| 568 |
|
|
|
|
|
|
|
| 569 |
try:
|
| 570 |
-
response, conversation_history, metadata = summarise_output_topics_query(model_choice, in_api_key, temperature, formatted_summary_prompt,
|
| 571 |
summarised_output = response
|
| 572 |
summarised_output = re.sub(r'\n{2,}', '\n', summarised_output) # Replace multiple line breaks with a single line break
|
| 573 |
summarised_output = re.sub(r'^\n{1,}', '', summarised_output) # Remove one or more line breaks at the start
|
|
@@ -584,7 +587,7 @@ def summarise_output_topics(summarised_references:pd.DataFrame,
|
|
| 584 |
|
| 585 |
# Check if beyond max time allowed for processing and break if necessary
|
| 586 |
toc = time.perf_counter()
|
| 587 |
-
time_taken =
|
| 588 |
|
| 589 |
if time_taken > max_time_for_loop:
|
| 590 |
print("Time taken for loop is greater than maximum time allowed. Exiting and restarting loop")
|
|
@@ -594,7 +597,7 @@ def summarise_output_topics(summarised_references:pd.DataFrame,
|
|
| 594 |
|
| 595 |
# If all summaries completed
|
| 596 |
if latest_summary_completed >= length_all_summaries:
|
| 597 |
-
print("At last summary.")
|
| 598 |
|
| 599 |
output_files = list(set(output_files))
|
| 600 |
|
|
@@ -605,10 +608,13 @@ def overall_summary(topic_summary_df:pd.DataFrame,
|
|
| 605 |
in_api_key:str,
|
| 606 |
temperature:float,
|
| 607 |
table_file_name:str,
|
| 608 |
-
|
| 609 |
-
|
|
|
|
| 610 |
summarise_everything_prompt:str=summarise_everything_prompt,
|
| 611 |
comprehensive_summary_format_prompt:str=comprehensive_summary_format_prompt,
|
|
|
|
|
|
|
| 612 |
do_summaries:str="Yes",
|
| 613 |
progress=gr.Progress(track_tqdm=True)):
|
| 614 |
'''
|
|
@@ -616,17 +622,35 @@ def overall_summary(topic_summary_df:pd.DataFrame,
|
|
| 616 |
'''
|
| 617 |
|
| 618 |
out_metadata = []
|
| 619 |
-
local_model = []
|
| 620 |
-
length_all_summaries = 1
|
| 621 |
latest_summary_completed = 0
|
| 622 |
output_files = []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 623 |
|
| 624 |
model_choice_clean = model_name_map[model_choice]
|
| 625 |
model_choice_clean_short = clean_column_name(model_choice_clean, max_length=20, front_characters=False)
|
| 626 |
file_name = re.search(r'(.*?)(?:_all_|_final_|_batch_|_col_)', table_file_name).group(1) if re.search(r'(.*?)(?:_all_|_final_|_batch_|_col_)', table_file_name) else table_file_name
|
| 627 |
latest_batch_completed = int(re.search(r'batch_(\d+)_', table_file_name).group(1)) if 'batch_' in table_file_name else ""
|
| 628 |
batch_size_number = int(re.search(r'size_(\d+)_', table_file_name).group(1)) if 'size_' in table_file_name else ""
|
| 629 |
-
in_column_cleaned = re.search(r'col_(.*?)
|
| 630 |
|
| 631 |
# Save outputs for each batch. If master file created, label file as master
|
| 632 |
if latest_batch_completed:
|
|
@@ -644,48 +668,75 @@ def overall_summary(topic_summary_df:pd.DataFrame,
|
|
| 644 |
local_model, tokenizer = load_model(local_model_type=CHOSEN_LOCAL_MODEL_TYPE, repo_id=LOCAL_REPO_ID, model_filename=LOCAL_MODEL_FILE, model_dir=LOCAL_MODEL_FOLDER)
|
| 645 |
#print("Local model loaded:", local_model)
|
| 646 |
|
| 647 |
-
|
| 648 |
-
summary_loop = tqdm(range(latest_summary_completed, length_all_summaries), desc="Creating summaries", unit="summaries")
|
| 649 |
|
| 650 |
if do_summaries == "Yes":
|
| 651 |
-
for
|
| 652 |
|
| 653 |
-
|
|
|
|
|
|
|
| 654 |
|
| 655 |
formatted_summary_prompt = [summarise_everything_prompt.format(topic_summary_table=summary_text, summary_format=comprehensive_summary_format_prompt)]
|
| 656 |
|
|
|
|
|
|
|
| 657 |
try:
|
| 658 |
-
response, conversation_history, metadata = summarise_output_topics_query(model_choice, in_api_key, temperature, formatted_summary_prompt,
|
| 659 |
summarised_output = response
|
| 660 |
summarised_output = re.sub(r'\n{2,}', '\n', summarised_output) # Replace multiple line breaks with a single line break
|
| 661 |
summarised_output = re.sub(r'^\n{1,}', '', summarised_output) # Remove one or more line breaks at the start
|
| 662 |
summarised_output = summarised_output.strip()
|
| 663 |
except Exception as e:
|
| 664 |
-
print(e)
|
| 665 |
summarised_output = ""
|
| 666 |
|
| 667 |
summarised_outputs.append(summarised_output)
|
|
|
|
|
|
|
| 668 |
out_metadata.extend(metadata)
|
| 669 |
out_metadata_str = '. '.join(out_metadata)
|
| 670 |
|
| 671 |
latest_summary_completed += 1
|
| 672 |
|
| 673 |
-
|
| 674 |
-
toc = time.perf_counter()
|
| 675 |
-
time_taken = tic - toc
|
| 676 |
|
| 677 |
-
#
|
| 678 |
-
|
| 679 |
-
overall_summary_output_path = output_folder + batch_file_path_details + "_overall_summary_" + model_choice_clean_short + "_temp_" + str(temperature) + ".txt"
|
| 680 |
|
| 681 |
-
# Write
|
| 682 |
try:
|
| 683 |
with open(overall_summary_output_path, "w", encoding='utf-8', errors='replace') as f:
|
| 684 |
f.write(summarised_output)
|
| 685 |
-
output_files.append(overall_summary_output_path)
|
| 686 |
except Exception as e:
|
| 687 |
print(f"Error writing prompt to file {overall_summary_output_path}: {e}")
|
| 688 |
|
| 689 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 690 |
|
| 691 |
-
return output_files,
|
|
|
|
| 7 |
import time
|
| 8 |
from tqdm import tqdm
|
| 9 |
|
| 10 |
+
from tools.prompts import summarise_topic_descriptions_prompt, summarise_topic_descriptions_system_prompt, system_prompt, summarise_everything_prompt, comprehensive_summary_format_prompt, summarise_everything_system_prompt, comprehensive_summary_format_prompt_by_group
|
| 11 |
from tools.llm_funcs import construct_gemini_generative_model, process_requests, ResponseObject, load_model
|
| 12 |
from tools.helper_functions import create_topic_summary_df_from_reference_table, load_in_data_file, get_basic_response_data, convert_reference_table_to_pivot_table, wrap_text, clean_column_name
|
| 13 |
from tools.config import OUTPUT_FOLDER, RUN_LOCAL_MODEL, MAX_COMMENT_CHARS, MAX_TOKENS, TIMEOUT_WAIT, NUMBER_OF_RETRY_ATTEMPTS, MAX_TIME_FOR_LOOP, BATCH_SIZE_DEFAULT, DEDUPLICATION_THRESHOLD, model_name_map, CHOSEN_LOCAL_MODEL_TYPE, LOCAL_REPO_ID, LOCAL_MODEL_FILE, LOCAL_MODEL_FOLDER, LLM_SEED
|
|
|
|
| 437 |
chosen_cols:List[str]=[],
|
| 438 |
log_output_files:list[str]=[],
|
| 439 |
summarise_format_radio:str="Return a summary up to two paragraphs long that includes as much detail as possible from the original text",
|
| 440 |
+
output_folder:str=OUTPUT_FOLDER,
|
| 441 |
+
context_textbox:str="",
|
| 442 |
summarise_topic_descriptions_prompt:str=summarise_topic_descriptions_prompt, summarise_topic_descriptions_system_prompt:str=summarise_topic_descriptions_system_prompt,
|
| 443 |
do_summaries:str="Yes",
|
| 444 |
progress=gr.Progress(track_tqdm=True)):
|
|
|
|
| 567 |
#print("summary_text:", summary_text)
|
| 568 |
formatted_summary_prompt = [summarise_topic_descriptions_prompt.format(summaries=summary_text, summary_format=summarise_format_radio)]
|
| 569 |
|
| 570 |
+
formatted_summarise_topic_descriptions_system_prompt = summarise_topic_descriptions_system_prompt.format(column_name=chosen_cols[0],consultation_context=context_textbox)
|
| 571 |
+
|
| 572 |
try:
|
| 573 |
+
response, conversation_history, metadata = summarise_output_topics_query(model_choice, in_api_key, temperature, formatted_summary_prompt, formatted_summarise_topic_descriptions_system_prompt, local_model)
|
| 574 |
summarised_output = response
|
| 575 |
summarised_output = re.sub(r'\n{2,}', '\n', summarised_output) # Replace multiple line breaks with a single line break
|
| 576 |
summarised_output = re.sub(r'^\n{1,}', '', summarised_output) # Remove one or more line breaks at the start
|
|
|
|
| 587 |
|
| 588 |
# Check if beyond max time allowed for processing and break if necessary
|
| 589 |
toc = time.perf_counter()
|
| 590 |
+
time_taken = toc - tic
|
| 591 |
|
| 592 |
if time_taken > max_time_for_loop:
|
| 593 |
print("Time taken for loop is greater than maximum time allowed. Exiting and restarting loop")
|
|
|
|
| 597 |
|
| 598 |
# If all summaries completed
|
| 599 |
if latest_summary_completed >= length_all_summaries:
|
| 600 |
+
print("At last summary. Time taken:", time_taken)
|
| 601 |
|
| 602 |
output_files = list(set(output_files))
|
| 603 |
|
|
|
|
| 608 |
in_api_key:str,
|
| 609 |
temperature:float,
|
| 610 |
table_file_name:str,
|
| 611 |
+
output_folder:str=OUTPUT_FOLDER,
|
| 612 |
+
chosen_cols:List[str]=[],
|
| 613 |
+
context_textbox:str="",
|
| 614 |
summarise_everything_prompt:str=summarise_everything_prompt,
|
| 615 |
comprehensive_summary_format_prompt:str=comprehensive_summary_format_prompt,
|
| 616 |
+
comprehensive_summary_format_prompt_by_group:str=comprehensive_summary_format_prompt_by_group,
|
| 617 |
+
summarise_everything_system_prompt:str=summarise_everything_system_prompt,
|
| 618 |
do_summaries:str="Yes",
|
| 619 |
progress=gr.Progress(track_tqdm=True)):
|
| 620 |
'''
|
|
|
|
| 622 |
'''
|
| 623 |
|
| 624 |
out_metadata = []
|
| 625 |
+
local_model = []
|
|
|
|
| 626 |
latest_summary_completed = 0
|
| 627 |
output_files = []
|
| 628 |
+
txt_summarised_outputs = []
|
| 629 |
+
summarised_outputs = []
|
| 630 |
+
|
| 631 |
+
if "Group" not in topic_summary_df.columns:
|
| 632 |
+
topic_summary_df["Group"] = "All"
|
| 633 |
+
|
| 634 |
+
|
| 635 |
+
topic_summary_df = topic_summary_df.sort_values(by=["Group", "Number of responses"], ascending=[True, False])
|
| 636 |
+
|
| 637 |
+
unique_groups = sorted(topic_summary_df["Group"].unique())
|
| 638 |
+
|
| 639 |
+
print("unique_groups:", unique_groups)
|
| 640 |
+
|
| 641 |
+
length_groups = len(unique_groups)
|
| 642 |
+
|
| 643 |
+
if length_groups > 1:
|
| 644 |
+
comprehensive_summary_format_prompt = comprehensive_summary_format_prompt_by_group
|
| 645 |
+
else:
|
| 646 |
+
comprehensive_summary_format_prompt = comprehensive_summary_format_prompt
|
| 647 |
|
| 648 |
model_choice_clean = model_name_map[model_choice]
|
| 649 |
model_choice_clean_short = clean_column_name(model_choice_clean, max_length=20, front_characters=False)
|
| 650 |
file_name = re.search(r'(.*?)(?:_all_|_final_|_batch_|_col_)', table_file_name).group(1) if re.search(r'(.*?)(?:_all_|_final_|_batch_|_col_)', table_file_name) else table_file_name
|
| 651 |
latest_batch_completed = int(re.search(r'batch_(\d+)_', table_file_name).group(1)) if 'batch_' in table_file_name else ""
|
| 652 |
batch_size_number = int(re.search(r'size_(\d+)_', table_file_name).group(1)) if 'size_' in table_file_name else ""
|
| 653 |
+
in_column_cleaned = re.search(r'col_(.*?)_unique', table_file_name).group(1) if 'col_' in table_file_name else ""
|
| 654 |
|
| 655 |
# Save outputs for each batch. If master file created, label file as master
|
| 656 |
if latest_batch_completed:
|
|
|
|
| 668 |
local_model, tokenizer = load_model(local_model_type=CHOSEN_LOCAL_MODEL_TYPE, repo_id=LOCAL_REPO_ID, model_filename=LOCAL_MODEL_FILE, model_dir=LOCAL_MODEL_FOLDER)
|
| 669 |
#print("Local model loaded:", local_model)
|
| 670 |
|
| 671 |
+
summary_loop = tqdm(unique_groups, desc="Creating summaries for groups", unit="groups")
|
|
|
|
| 672 |
|
| 673 |
if do_summaries == "Yes":
|
| 674 |
+
for summary_group in summary_loop:
|
| 675 |
|
| 676 |
+
print("Creating summary for group:", summary_group)
|
| 677 |
+
|
| 678 |
+
summary_text = topic_summary_df.loc[topic_summary_df["Group"]==summary_group].to_markdown(index=False)
|
| 679 |
|
| 680 |
formatted_summary_prompt = [summarise_everything_prompt.format(topic_summary_table=summary_text, summary_format=comprehensive_summary_format_prompt)]
|
| 681 |
|
| 682 |
+
formatted_summarise_everything_system_prompt = summarise_everything_system_prompt.format(column_name=chosen_cols[0],consultation_context=context_textbox)
|
| 683 |
+
|
| 684 |
try:
|
| 685 |
+
response, conversation_history, metadata = summarise_output_topics_query(model_choice, in_api_key, temperature, formatted_summary_prompt, formatted_summarise_everything_system_prompt, local_model)
|
| 686 |
summarised_output = response
|
| 687 |
summarised_output = re.sub(r'\n{2,}', '\n', summarised_output) # Replace multiple line breaks with a single line break
|
| 688 |
summarised_output = re.sub(r'^\n{1,}', '', summarised_output) # Remove one or more line breaks at the start
|
| 689 |
summarised_output = summarised_output.strip()
|
| 690 |
except Exception as e:
|
| 691 |
+
print("Cannot create overall summary for group:", summary_group, "due to:", e)
|
| 692 |
summarised_output = ""
|
| 693 |
|
| 694 |
summarised_outputs.append(summarised_output)
|
| 695 |
+
txt_summarised_outputs.append(f"""Group name: {summary_group}\n""" + summarised_output)
|
| 696 |
+
|
| 697 |
out_metadata.extend(metadata)
|
| 698 |
out_metadata_str = '. '.join(out_metadata)
|
| 699 |
|
| 700 |
latest_summary_completed += 1
|
| 701 |
|
| 702 |
+
summary_group_short = clean_column_name(summary_group)
|
|
|
|
|
|
|
| 703 |
|
| 704 |
+
# Write outputs
|
| 705 |
+
overall_summary_output_path = output_folder + batch_file_path_details + "_overall_summary_grp" + summary_group_short + "_" + model_choice_clean_short + "_temp_" + str(temperature) + ".txt"
|
|
|
|
| 706 |
|
| 707 |
+
# Write single group outputs
|
| 708 |
try:
|
| 709 |
with open(overall_summary_output_path, "w", encoding='utf-8', errors='replace') as f:
|
| 710 |
f.write(summarised_output)
|
| 711 |
+
# output_files.append(overall_summary_output_path)
|
| 712 |
except Exception as e:
|
| 713 |
print(f"Error writing prompt to file {overall_summary_output_path}: {e}")
|
| 714 |
|
| 715 |
+
# Write overall outputs to csv
|
| 716 |
+
overall_summary_output_csv_path = output_folder + batch_file_path_details + "_overall_summary_" + model_choice_clean_short + "_temp_" + str(temperature) + ".csv"
|
| 717 |
+
summarised_outputs_df = pd.DataFrame(data={"Group":unique_groups, "Summary":summarised_outputs})
|
| 718 |
+
summarised_outputs_df.to_csv(overall_summary_output_csv_path, index=None)
|
| 719 |
+
output_files.append(overall_summary_output_csv_path)
|
| 720 |
+
|
| 721 |
+
markdown_output_table = summarised_outputs_df.to_markdown(index=False)
|
| 722 |
+
|
| 723 |
+
# Text output file
|
| 724 |
+
summarised_outputs_join = "\n".join(txt_summarised_outputs)
|
| 725 |
+
overall_summary_output_txt_path = output_folder + batch_file_path_details + "_overall_summary_" + model_choice_clean_short + "_temp_" + str(temperature) + ".txt"
|
| 726 |
+
|
| 727 |
+
try:
|
| 728 |
+
with open(overall_summary_output_txt_path, "w", encoding='utf-8', errors='replace') as f:
|
| 729 |
+
f.write(summarised_outputs_join)
|
| 730 |
+
output_files.append(overall_summary_output_txt_path)
|
| 731 |
+
except Exception as e:
|
| 732 |
+
print(f"Error writing prompt to file {overall_summary_output_txt_path}: {e}")
|
| 733 |
+
|
| 734 |
+
output_files = list(set(output_files))
|
| 735 |
+
|
| 736 |
+
# Check if beyond max time allowed for processing and break if necessary
|
| 737 |
+
toc = time.perf_counter()
|
| 738 |
+
time_taken = toc - tic
|
| 739 |
+
|
| 740 |
+
print("All group summaries created. Time taken:", time_taken)
|
| 741 |
|
| 742 |
+
return output_files, markdown_output_table
|
tools/llm_api_call.py
CHANGED
|
@@ -186,7 +186,7 @@ def data_file_to_markdown_table(file_data:pd.DataFrame, file_name:str, chosen_co
|
|
| 186 |
|
| 187 |
return simplified_csv_table_path, normalised_simple_markdown_table, start_row, end_row, batch_basic_response_data
|
| 188 |
|
| 189 |
-
def replace_punctuation_with_underscore(input_string):
|
| 190 |
# Create a translation table where each punctuation character maps to '_'
|
| 191 |
translation_table = str.maketrans(string.punctuation, '_' * len(string.punctuation))
|
| 192 |
|
|
|
|
| 186 |
|
| 187 |
return simplified_csv_table_path, normalised_simple_markdown_table, start_row, end_row, batch_basic_response_data
|
| 188 |
|
| 189 |
+
def replace_punctuation_with_underscore(input_string:str):
|
| 190 |
# Create a translation table where each punctuation character maps to '_'
|
| 191 |
translation_table = str.maketrans(string.punctuation, '_' * len(string.punctuation))
|
| 192 |
|
tools/prompts.py
CHANGED
|
@@ -1,4 +1,6 @@
|
|
| 1 |
-
|
|
|
|
|
|
|
| 2 |
|
| 3 |
initial_table_prompt = """The open text data is shown in the following table that contains two columns, Reference and Response. Response table:
|
| 4 |
{response_table}
|
|
@@ -48,6 +50,10 @@ New table:"""
|
|
| 48 |
|
| 49 |
# Return only one table in markdown format containing all relevant topics. Remove topics from the table that are not assigned to any response. Do not repeat Subtopics with the same Sentiment.
|
| 50 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 51 |
summarise_topic_descriptions_system_prompt = system_prompt
|
| 52 |
|
| 53 |
summarise_topic_descriptions_prompt = """Below is a table with number of paragraphs related to the data from the open text column:
|
|
@@ -62,36 +68,33 @@ single_para_summary_format_prompt = "Return a concise summary up to one paragrap
|
|
| 62 |
|
| 63 |
two_para_summary_format_prompt = "Return a summary up to two paragraphs long that includes as much detail as possible from the original text"
|
| 64 |
|
|
|
|
|
|
|
|
|
|
| 65 |
|
| 66 |
-
|
| 67 |
|
| 68 |
-
|
| 69 |
|
| 70 |
-
|
| 71 |
-
{topics}
|
| 72 |
|
| 73 |
-
Your task is to
|
| 74 |
|
| 75 |
-
|
| 76 |
|
|
|
|
| 77 |
|
| 78 |
-
|
| 79 |
|
| 80 |
-
summarise_everything_prompt = """Below is a table that gives an overview of the main topics from a dataset of open text responses along with a description of each topic, and the number of responses that mentioned each topic:
|
| 81 |
|
| 82 |
-
'{topic_summary_table}'
|
| 83 |
|
| 84 |
-
Your task is to summarise the above table in markdown format. {summary_format}. Return only the summary in markdown format and no other text.
|
| 85 |
|
| 86 |
-
|
|
|
|
|
|
|
| 87 |
|
| 88 |
-
comprehensive_summary_format_prompt = "Return a comprehensive summary that covers all the important topics and themes described in the table. If there are different values in the Group column of the data, compare and contrast differences between the topics and themes from each Group."
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
### Verify existing categories prompt
|
| 92 |
verify_titles_system_prompt = system_prompt
|
| 93 |
|
| 94 |
-
|
| 95 |
verify_titles_prompt = """Response numbers alongside the Response text and assigned descriptions are shown in the table below:
|
| 96 |
{response_table}
|
| 97 |
|
|
@@ -107,6 +110,16 @@ Do not add any other text to your response.
|
|
| 107 |
Output markdown table:"""
|
| 108 |
|
| 109 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 110 |
|
| 111 |
# example_instruction_prompt_llama3 = """<|start_header_id|>system<|end_header_id|>\n
|
| 112 |
# You are an AI assistant that follows instruction extremely well. Help as much as you can.<|eot_id|><|start_header_id|>user<|end_header_id|>\n
|
|
|
|
| 1 |
+
generic_system_prompt = """You are a researcher analysing responses from an open text dataset. You are analysing a single column from this dataset."""
|
| 2 |
+
|
| 3 |
+
system_prompt = """You are a researcher analysing responses from an open text dataset. You are analysing a single column from this dataset called '{column_name}'. The context of this analysis is '{consultation_context}'."""
|
| 4 |
|
| 5 |
initial_table_prompt = """The open text data is shown in the following table that contains two columns, Reference and Response. Response table:
|
| 6 |
{response_table}
|
|
|
|
| 50 |
|
| 51 |
# Return only one table in markdown format containing all relevant topics. Remove topics from the table that are not assigned to any response. Do not repeat Subtopics with the same Sentiment.
|
| 52 |
|
| 53 |
+
###
|
| 54 |
+
# SUMMARISE TOPICS PROMPT
|
| 55 |
+
###
|
| 56 |
+
|
| 57 |
summarise_topic_descriptions_system_prompt = system_prompt
|
| 58 |
|
| 59 |
summarise_topic_descriptions_prompt = """Below is a table with number of paragraphs related to the data from the open text column:
|
|
|
|
| 68 |
|
| 69 |
two_para_summary_format_prompt = "Return a summary up to two paragraphs long that includes as much detail as possible from the original text"
|
| 70 |
|
| 71 |
+
###
|
| 72 |
+
# OVERALL SUMMARY PROMPTS
|
| 73 |
+
###
|
| 74 |
|
| 75 |
+
summarise_everything_system_prompt = generic_system_prompt
|
| 76 |
|
| 77 |
+
summarise_everything_prompt = """Below is a table that gives an overview of the main topics from a dataset of open text responses along with a description of each topic, and the number of responses that mentioned each topic:
|
| 78 |
|
| 79 |
+
'{topic_summary_table}'
|
|
|
|
| 80 |
|
| 81 |
+
Your task is to summarise the above table. {summary_format}. Return only the summary and no other text.
|
| 82 |
|
| 83 |
+
Summary:"""
|
| 84 |
|
| 85 |
+
comprehensive_summary_format_prompt = "Return a comprehensive summary that covers all the important topics and themes described in the table. Structure the summary with General Topics as headings, with significant Subtopics described in bullet points below them in order of relative significance. Do not explicitly mention the Sentiment, Number of responses, or Group values. Do not use the words 'General Topic' or 'Subtopic' directly in the summary."
|
| 86 |
|
| 87 |
+
comprehensive_summary_format_prompt_by_group = "Return a comprehensive summary that covers all the important topics and themes described in the table. Structure the summary with General Topics as headings, with significant Subtopics described in bullet points below them in order of relative significance. Do not explicitly mention the Sentiment, Number of responses, or Group values. Do not use the words 'General Topic' or 'Subtopic' directly in the summary. Compare and contrast differences between the topics and themes from each Group."
|
| 88 |
|
|
|
|
| 89 |
|
|
|
|
| 90 |
|
|
|
|
| 91 |
|
| 92 |
+
###
|
| 93 |
+
# VERIFY EXISTING DESCRIPTIONS/TITLES
|
| 94 |
+
###
|
| 95 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 96 |
verify_titles_system_prompt = system_prompt
|
| 97 |
|
|
|
|
| 98 |
verify_titles_prompt = """Response numbers alongside the Response text and assigned descriptions are shown in the table below:
|
| 99 |
{response_table}
|
| 100 |
|
|
|
|
| 110 |
Output markdown table:"""
|
| 111 |
|
| 112 |
|
| 113 |
+
## The following didn't work well in testing and so is not currently used
|
| 114 |
+
|
| 115 |
+
create_general_topics_system_prompt = system_prompt
|
| 116 |
+
|
| 117 |
+
create_general_topics_prompt = """Subtopics known to be relevant to this dataset are shown in the following Topics table:
|
| 118 |
+
{topics}
|
| 119 |
+
|
| 120 |
+
Your task is to create a General Topic name for each Subtopic. The new Topics table should have the columns 'General Topic' and 'Subtopic' only. Write a 'General Topic' text label relevant to the Subtopic next to it in the new table. The text label should describe the general theme of the Subtopic. Do not add any other text, thoughts, or notes to your response.
|
| 121 |
+
|
| 122 |
+
New Topics table:"""
|
| 123 |
|
| 124 |
# example_instruction_prompt_llama3 = """<|start_header_id|>system<|end_header_id|>\n
|
| 125 |
# You are an AI assistant that follows instruction extremely well. Help as much as you can.<|eot_id|><|start_header_id|>user<|end_header_id|>\n
|