seanpedrickcase committed
Commit f2d85f1 · 1 Parent(s): 125e31b

Improved the implementation of group-based analysis. It should now be possible to run a group-based analysis all the way through to summarisation.
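The recurring idiom behind this commit is to default a `Group` column to "All" when the input has no grouping, and to include `Group` in every downstream groupby key. Below is a minimal sketch of that guard, not the repo's own function; the real changes live in tools/dedup_summaries.py and tools/helper_functions.py.

```python
import pandas as pd

def ensure_group_column(df: pd.DataFrame) -> pd.DataFrame:
    # Guard added throughout this commit: fall back to a single "All" group
    # when no group-based analysis was run.
    if "Group" not in df.columns:
        df["Group"] = "All"
    return df

# Downstream groupby keys now include "Group", so topics from different
# groups are never merged together, e.g.:
# reference_df.groupby(["General Topic", "Subtopic", "Sentiment", "Group"])
```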

app.py CHANGED
@@ -80,23 +80,18 @@ with app:
80
  # UI LAYOUT
81
  ###
82
 
83
- gr.Markdown(
84
- """# Large language model topic modelling
85
 
86
  Extract topics and summarise outputs using Large Language Models (LLMs, a Gemma model if local, Gemini Flash/Pro, or Claude 3 through AWS Bedrock if running on AWS). The app will query the LLM with batches of responses to produce summary tables, which are then compared iteratively to output a table with the general topics, subtopics, topic sentiment, and relevant text rows related to them. The prompts are designed for topic modelling public consultations, but they can be adapted to different contexts (see the LLM settings tab to modify).
87
 
88
  Instructions on use can be found in the README.md file. Try it out with this [dummy development consultation dataset](https://huggingface.co/datasets/seanpedrickcase/dummy_development_consultation), which you can also try with [zero-shot topics](https://huggingface.co/datasets/seanpedrickcase/dummy_development_consultation/blob/main/example_zero_shot.csv), or this [dummy case notes dataset](https://huggingface.co/datasets/seanpedrickcase/dummy_case_notes).
89
 
90
- You can use an AWS Bedrock model (Claude 3, paid), or Gemini (a free API, but with strict limits for the Pro model). The use of Gemini models requires an API key. To set up your own Gemini API key, go [here](https://aistudio.google.com/app/u/1/plan_information).
91
 
92
  NOTE that **API calls to Gemini are not considered secure**, so please only submit redacted, non-sensitive tabular files to this source. Also, large language models are not 100% accurate and may produce biased or harmful outputs. All outputs from this app **absolutely need to be checked by a human** for harmful outputs, hallucinations, and accuracy.""")
93
 
94
  with gr.Tab(label="Extract topics"):
95
- gr.Markdown(
96
- """
97
- ### Choose a tabular data file (xlsx or csv) of open text to extract topics from.
98
- """
99
- )
100
  with gr.Row():
101
  model_choice = gr.Dropdown(value = default_model_choice, choices = model_full_names, label="LLM model to use", multiselect=False)
102
  in_api_key = gr.Textbox(value = GEMINI_API_KEY, label="Enter Gemini API key (only if using Google API models)", lines=1, type="password")
@@ -138,10 +133,7 @@ with app:
138
  s3_logs_output_textbox = gr.Textbox(label="Feedback submission logs", visible=False)
139
 
140
  with gr.Tab(label="Modify, deduplicate, and summarise topic outputs"):
141
- gr.Markdown(
142
- """
143
- Load in previously completed Extract Topics output files ('reference_table', and 'unique_topics' files) to modify topics, deduplicate topics, or summarise the outputs. If you want pivot table outputs, please load in the original data file along with the selected open text column on the first tab before deduplicating or summarising.
144
- """)
145
 
146
  with gr.Accordion("Modify existing topics", open = False):
147
  modification_input_files = gr.File(height=file_input_height, label="Upload files to modify topics", file_count= "multiple", file_types=['.xlsx', '.xls', '.csv', '.parquet', '.csv.gz'])
@@ -184,19 +176,13 @@ with app:
184
  overall_summarised_output_markdown = gr.Markdown(value="### Overall summary will appear here", show_copy_button=True)
185
 
186
  with gr.Tab(label="Topic table viewer"):
187
- gr.Markdown(
188
- """
189
- ### View a 'unique_topic_table' csv file in markdown format.
190
- """)
191
 
192
  in_view_table = gr.File(height=file_input_height, label="Choose unique topic csv files", file_count= "single", file_types=['.csv', '.parquet', '.csv.gz'])
193
  view_table_markdown = gr.Markdown(value = "", label="View table", show_copy_button=True)
194
 
195
  with gr.Tab(label="Continue unfinished topic extraction"):
196
- gr.Markdown(
197
- """
198
- ### Load in output files from a previous topic extraction process and continue topic extraction with new data.
199
- """)
200
 
201
  with gr.Accordion("Upload reference data file and unique data files", open = True):
202
  in_previous_data_files = gr.File(height=file_input_height, label="Choose output csv files", file_count= "multiple", file_types=['.xlsx', '.xls', '.csv', '.parquet', '.csv.gz'])
@@ -204,11 +190,7 @@ with app:
204
  continue_previous_data_files_btn = gr.Button(value="Continue previous topic extraction", variant="primary")
205
 
206
  with gr.Tab(label="Verify descriptions"):
207
- gr.Markdown(
208
- """
209
- ### Choose a tabular data file (xlsx or csv) with titles and original text to verify descriptions for.
210
- """
211
- )
212
  with gr.Row():
213
  verify_model_choice = gr.Dropdown(value = default_model_choice, choices = model_full_names, label="LLM model to use", multiselect=False)
214
  verify_in_api_key = gr.Textbox(value = "", label="Enter Gemini API key (only if using Google API models)", lines=1, type="password")
@@ -227,10 +209,7 @@ with app:
227
  verify_modification_input_files_placeholder = gr.File(height=file_input_height, label="Placeholder for files to avoid errors", visible=False)
228
 
229
  with gr.Tab(label="Topic extraction settings"):
230
- gr.Markdown(
231
- """
232
- Define settings that affect large language model output.
233
- """)
234
  with gr.Accordion("Settings for LLM generation", open = True):
235
  temperature_slide = gr.Slider(minimum=0.1, maximum=1.0, value=0.1, label="Choose LLM temperature setting")
236
  batch_size_number = gr.Number(label = "Number of responses to submit in a single LLM query", value = BATCH_SIZE_DEFAULT, precision=0, minimum=1, maximum=100)
@@ -296,32 +275,28 @@ with app:
296
  success(fn=wrapper_extract_topics_per_column_value,
297
  inputs=[in_group_col,
298
  in_data_files,
299
-
300
  file_data_state,
301
  master_topic_df_state,
302
  master_reference_df_state,
303
  master_unique_topics_df_state,
304
  display_topic_table_markdown,
305
  reference_data_file_name_textbox,
306
-
307
  total_number_of_batches,
308
  in_api_key,
309
  temperature_slide,
310
  in_colnames,
311
  model_choice,
312
  candidate_topics,
313
-
314
  first_loop_state,
315
  conversation_metadata_textbox,
316
  latest_batch_completed,
317
- estimated_time_taken_number,
318
-
319
  initial_table_prompt_textbox,
320
  prompt_2_textbox,
321
  prompt_3_textbox,
322
  system_prompt_textbox,
323
- add_to_existing_topics_system_prompt_textbox, add_to_existing_topics_prompt_textbox,
324
-
325
  number_of_prompts,
326
  batch_size_number,
327
  context_textbox,
@@ -371,14 +346,14 @@ with app:
371
  # When button pressed, summarise previous data
372
  summarise_previous_data_btn.click(empty_output_vars_summarise, inputs=None, outputs=[summary_reference_table_sample_state, master_unique_topics_df_revised_summaries_state, master_reference_df_revised_summaries_state, summary_output_files, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, overall_summarisation_input_files]).\
373
  success(load_in_previous_data_files, inputs=[summarisation_input_files], outputs=[master_reference_df_state, master_unique_topics_df_state, latest_batch_completed_no_loop, deduplication_input_files_status, reference_data_file_name_textbox, unique_topics_table_file_name_textbox]).\
374
- success(sample_reference_table_summaries, inputs=[master_reference_df_state, master_unique_topics_df_state, random_seed], outputs=[summary_reference_table_sample_state, summarised_references_markdown, master_reference_df_state, master_unique_topics_df_state], api_name="sample_summaries").\
375
  success(summarise_output_topics, inputs=[summary_reference_table_sample_state, master_unique_topics_df_state, master_reference_df_state, model_choice, in_api_key, temperature_slide, reference_data_file_name_textbox, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, in_data_files, in_excel_sheets, in_colnames, log_files_output_list_state, summarise_format_radio, output_folder_state], outputs=[summary_reference_table_sample_state, master_unique_topics_df_revised_summaries_state, master_reference_df_revised_summaries_state, summary_output_files, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, summarised_output_markdown, log_files_output, overall_summarisation_input_files], api_name="summarise_topics")
376
 
377
  latest_summary_completed_num.change(summarise_output_topics, inputs=[summary_reference_table_sample_state, master_unique_topics_df_state, master_reference_df_state, model_choice, in_api_key, temperature_slide, reference_data_file_name_textbox, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, in_data_files, in_excel_sheets, in_colnames, log_files_output_list_state, summarise_format_radio, output_folder_state], outputs=[summary_reference_table_sample_state, master_unique_topics_df_revised_summaries_state, master_reference_df_revised_summaries_state, summary_output_files, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, summarised_output_markdown, log_files_output, overall_summarisation_input_files], scroll_to_output=True)
378
 
379
  # SUMMARISE WHOLE TABLE PAGE
380
  overall_summarise_previous_data_btn.click(load_in_previous_data_files, inputs=[overall_summarisation_input_files], outputs=[master_reference_df_state, master_unique_topics_df_state, latest_batch_completed_no_loop, deduplication_input_files_status, reference_data_file_name_textbox, unique_topics_table_file_name_textbox]).\
381
- success(overall_summary, inputs=[master_unique_topics_df_state, model_choice, in_api_key, temperature_slide, reference_data_file_name_textbox, summarised_outputs_list, output_folder_state], outputs=[overall_summary_output_files, overall_summarised_output_markdown], scroll_to_output=True, api_name="overall_summary")
382
 
383
  ###
384
  # CONTINUE PREVIOUS TOPIC EXTRACTION PAGE
 
80
  # UI LAYOUT
81
  ###
82
 
83
+ gr.Markdown("""# Large language model topic modelling
 
84
 
85
  Extract topics and summarise outputs using Large Language Models (LLMs, a Gemma model if local, Gemini Flash/Pro, or Claude 3 through AWS Bedrock if running on AWS). The app will query the LLM with batches of responses to produce summary tables, which are then compared iteratively to output a table with the general topics, subtopics, topic sentiment, and relevant text rows related to them. The prompts are designed for topic modelling public consultations, but they can be adapted to different contexts (see the LLM settings tab to modify).
86
 
87
  Instructions on use can be found in the README.md file. Try it out with this [dummy development consultation dataset](https://huggingface.co/datasets/seanpedrickcase/dummy_development_consultation), which you can also try with [zero-shot topics](https://huggingface.co/datasets/seanpedrickcase/dummy_development_consultation/blob/main/example_zero_shot.csv), or this [dummy case notes dataset](https://huggingface.co/datasets/seanpedrickcase/dummy_case_notes).
88
 
89
+ You can use an AWS Bedrock model (Claude 3, paid), or Gemini (a free API, but with strict limits for the Pro model). The use of Gemini models requires an API key. To set up your own Gemini API key, go [here](https://aistudio.google.com/app/u/1/plan_information).
90
 
91
  NOTE that **API calls to Gemini are not considered secure**, so please only submit redacted, non-sensitive tabular files to this source. Also, large language models are not 100% accurate and may produce biased or harmful outputs. All outputs from this app **absolutely need to be checked by a human** for harmful outputs, hallucinations, and accuracy.""")
92
 
93
  with gr.Tab(label="Extract topics"):
94
+ gr.Markdown("""### Choose a tabular data file (xlsx or csv) of open text to extract topics from.""")
 
 
 
 
95
  with gr.Row():
96
  model_choice = gr.Dropdown(value = default_model_choice, choices = model_full_names, label="LLM model to use", multiselect=False)
97
  in_api_key = gr.Textbox(value = GEMINI_API_KEY, label="Enter Gemini API key (only if using Google API models)", lines=1, type="password")
 
133
  s3_logs_output_textbox = gr.Textbox(label="Feedback submission logs", visible=False)
134
 
135
  with gr.Tab(label="Modify, deduplicate, and summarise topic outputs"):
136
+ gr.Markdown("""Load in previously completed Extract Topics output files ('reference_table', and 'unique_topics' files) to modify topics, deduplicate topics, or summarise the outputs. If you want pivot table outputs, please load in the original data file along with the selected open text column on the first tab before deduplicating or summarising.""")
 
 
 
137
 
138
  with gr.Accordion("Modify existing topics", open = False):
139
  modification_input_files = gr.File(height=file_input_height, label="Upload files to modify topics", file_count= "multiple", file_types=['.xlsx', '.xls', '.csv', '.parquet', '.csv.gz'])
 
176
  overall_summarised_output_markdown = gr.Markdown(value="### Overall summary will appear here", show_copy_button=True)
177
 
178
  with gr.Tab(label="Topic table viewer"):
179
+ gr.Markdown("""### View a 'unique_topic_table' csv file in markdown format.""")
 
 
 
180
 
181
  in_view_table = gr.File(height=file_input_height, label="Choose unique topic csv files", file_count= "single", file_types=['.csv', '.parquet', '.csv.gz'])
182
  view_table_markdown = gr.Markdown(value = "", label="View table", show_copy_button=True)
183
 
184
  with gr.Tab(label="Continue unfinished topic extraction"):
185
+ gr.Markdown("""### Load in output files from a previous topic extraction process and continue topic extraction with new data.""")
 
 
 
186
 
187
  with gr.Accordion("Upload reference data file and unique data files", open = True):
188
  in_previous_data_files = gr.File(height=file_input_height, label="Choose output csv files", file_count= "multiple", file_types=['.xlsx', '.xls', '.csv', '.parquet', '.csv.gz'])
 
190
  continue_previous_data_files_btn = gr.Button(value="Continue previous topic extraction", variant="primary")
191
 
192
  with gr.Tab(label="Verify descriptions"):
193
+ gr.Markdown("""### Choose a tabular data file (xlsx or csv) with titles and original text to verify descriptions for.""")
 
 
 
 
194
  with gr.Row():
195
  verify_model_choice = gr.Dropdown(value = default_model_choice, choices = model_full_names, label="LLM model to use", multiselect=False)
196
  verify_in_api_key = gr.Textbox(value = "", label="Enter Gemini API key (only if using Google API models)", lines=1, type="password")
 
209
  verify_modification_input_files_placeholder = gr.File(height=file_input_height, label="Placeholder for files to avoid errors", visible=False)
210
 
211
  with gr.Tab(label="Topic extraction settings"):
212
+ gr.Markdown("""Define settings that affect large language model output.""")
 
 
 
213
  with gr.Accordion("Settings for LLM generation", open = True):
214
  temperature_slide = gr.Slider(minimum=0.1, maximum=1.0, value=0.1, label="Choose LLM temperature setting")
215
  batch_size_number = gr.Number(label = "Number of responses to submit in a single LLM query", value = BATCH_SIZE_DEFAULT, precision=0, minimum=1, maximum=100)
 
275
  success(fn=wrapper_extract_topics_per_column_value,
276
  inputs=[in_group_col,
277
  in_data_files,
 
278
  file_data_state,
279
  master_topic_df_state,
280
  master_reference_df_state,
281
  master_unique_topics_df_state,
282
  display_topic_table_markdown,
283
  reference_data_file_name_textbox,
 
284
  total_number_of_batches,
285
  in_api_key,
286
  temperature_slide,
287
  in_colnames,
288
  model_choice,
289
  candidate_topics,
 
290
  first_loop_state,
291
  conversation_metadata_textbox,
292
  latest_batch_completed,
293
+ estimated_time_taken_number,
 
294
  initial_table_prompt_textbox,
295
  prompt_2_textbox,
296
  prompt_3_textbox,
297
  system_prompt_textbox,
298
+ add_to_existing_topics_system_prompt_textbox,
299
+ add_to_existing_topics_prompt_textbox,
300
  number_of_prompts,
301
  batch_size_number,
302
  context_textbox,
 
346
  # When button pressed, summarise previous data
347
  summarise_previous_data_btn.click(empty_output_vars_summarise, inputs=None, outputs=[summary_reference_table_sample_state, master_unique_topics_df_revised_summaries_state, master_reference_df_revised_summaries_state, summary_output_files, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, overall_summarisation_input_files]).\
348
  success(load_in_previous_data_files, inputs=[summarisation_input_files], outputs=[master_reference_df_state, master_unique_topics_df_state, latest_batch_completed_no_loop, deduplication_input_files_status, reference_data_file_name_textbox, unique_topics_table_file_name_textbox]).\
349
+ success(sample_reference_table_summaries, inputs=[master_reference_df_state, random_seed], outputs=[summary_reference_table_sample_state, summarised_references_markdown], api_name="sample_summaries").\
350
  success(summarise_output_topics, inputs=[summary_reference_table_sample_state, master_unique_topics_df_state, master_reference_df_state, model_choice, in_api_key, temperature_slide, reference_data_file_name_textbox, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, in_data_files, in_excel_sheets, in_colnames, log_files_output_list_state, summarise_format_radio, output_folder_state], outputs=[summary_reference_table_sample_state, master_unique_topics_df_revised_summaries_state, master_reference_df_revised_summaries_state, summary_output_files, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, summarised_output_markdown, log_files_output, overall_summarisation_input_files], api_name="summarise_topics")
351
 
352
  latest_summary_completed_num.change(summarise_output_topics, inputs=[summary_reference_table_sample_state, master_unique_topics_df_state, master_reference_df_state, model_choice, in_api_key, temperature_slide, reference_data_file_name_textbox, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, in_data_files, in_excel_sheets, in_colnames, log_files_output_list_state, summarise_format_radio, output_folder_state], outputs=[summary_reference_table_sample_state, master_unique_topics_df_revised_summaries_state, master_reference_df_revised_summaries_state, summary_output_files, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, summarised_output_markdown, log_files_output, overall_summarisation_input_files], scroll_to_output=True)
353
 
354
  # SUMMARISE WHOLE TABLE PAGE
355
  overall_summarise_previous_data_btn.click(load_in_previous_data_files, inputs=[overall_summarisation_input_files], outputs=[master_reference_df_state, master_unique_topics_df_state, latest_batch_completed_no_loop, deduplication_input_files_status, reference_data_file_name_textbox, unique_topics_table_file_name_textbox]).\
356
+ success(overall_summary, inputs=[master_unique_topics_df_state, model_choice, in_api_key, temperature_slide, unique_topics_table_file_name_textbox, summarised_outputs_list, output_folder_state], outputs=[overall_summary_output_files, overall_summarised_output_markdown], scroll_to_output=True, api_name="overall_summary")
357
 
358
  ###
359
  # CONTINUE PREVIOUS TOPIC EXTRACTION PAGE
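The event wiring in app.py chains handlers with Gradio's `.success()`, so each later step (sampling summaries, summarising topics, overall summary) only runs if the previous handler completed without raising. A stripped-down sketch of that pattern; the component and function names here are illustrative stand-ins, not the app's own:

```python
import gradio as gr

def load_files(files):
    # Stand-in for load_in_previous_data_files: report what was uploaded.
    return f"Loaded {len(files or [])} file(s)"

def summarise(status):
    # Stand-in for summarise_output_topics: only runs if load_files succeeded.
    return status + " -> summarised"

with gr.Blocks() as demo:
    in_files = gr.File(file_count="multiple", label="Previous output files")
    status = gr.Textbox(label="Status")
    summary = gr.Textbox(label="Summary")
    run_btn = gr.Button("Summarise previous data")

    # Chain the steps: the second handler fires only if the first succeeds,
    # mirroring the .click(...).success(...) chains in app.py.
    run_btn.click(load_files, inputs=[in_files], outputs=[status]).\
        success(summarise, inputs=[status], outputs=[summary])

demo.launch()
```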
tools/dedup_summaries.py CHANGED
@@ -9,7 +9,7 @@ from tqdm import tqdm
9
 
10
  from tools.prompts import summarise_topic_descriptions_prompt, summarise_topic_descriptions_system_prompt, system_prompt, summarise_everything_prompt, comprehensive_summary_format_prompt
11
  from tools.llm_funcs import construct_gemini_generative_model, process_requests, ResponseObject, load_model
12
- from tools.helper_functions import create_topic_summary_df_from_reference_table, load_in_data_file, get_basic_response_data, convert_reference_table_to_pivot_table, wrap_text
13
  from tools.config import OUTPUT_FOLDER, RUN_LOCAL_MODEL, MAX_COMMENT_CHARS, MAX_TOKENS, TIMEOUT_WAIT, NUMBER_OF_RETRY_ATTEMPTS, MAX_TIME_FOR_LOOP, BATCH_SIZE_DEFAULT, DEDUPLICATION_THRESHOLD, model_name_map, CHOSEN_LOCAL_MODEL_TYPE, LOCAL_REPO_ID, LOCAL_MODEL_FILE, LOCAL_MODEL_FOLDER, LLM_SEED
14
 
15
  max_tokens = MAX_TOKENS
@@ -158,17 +158,17 @@ def deduplicate_topics(reference_df:pd.DataFrame,
158
  print(out_message)
159
  #raise Exception(out_message)
160
 
161
-
162
-
163
  # Run through this x times to try to get all duplicate topics
164
  if deduplicate_topics == "Yes":
 
 
165
  for i in range(0, 8):
166
  if merge_sentiment == "No":
167
  if merge_general_topics == "No":
168
  reference_df["old_category"] = reference_df["Subtopic"] + " | " + reference_df["Sentiment"]
169
  reference_df_unique = reference_df.drop_duplicates("old_category")
170
 
171
- deduplicated_topic_map_df = reference_df_unique.groupby(["General Topic", "Sentiment"]).apply(
172
  lambda group: deduplicate_categories(
173
  group["Subtopic"],
174
  group["Sentiment"],
@@ -233,8 +233,6 @@ def deduplicate_topics(reference_df:pd.DataFrame,
233
  # Remove rows where 'deduplicated_category' is blank or NaN
234
  deduplicated_topic_map_df = deduplicated_topic_map_df.loc[(deduplicated_topic_map_df['deduplicated_category'].str.strip() != '') & ~(deduplicated_topic_map_df['deduplicated_category'].isnull()), ['old_category','deduplicated_category', 'match_score']]
235
 
236
- #deduplicated_topic_map_df.to_csv(output_folder + "deduplicated_topic_map_df_" + str(i) + ".csv", index=None)
237
-
238
  reference_df = reference_df.merge(deduplicated_topic_map_df, on="old_category", how="left")
239
 
240
  reference_df.rename(columns={"Subtopic": "Subtopic_old", "Sentiment": "Sentiment_old"}, inplace=True)
@@ -246,14 +244,9 @@ def deduplicate_topics(reference_df:pd.DataFrame,
246
  reference_df["Subtopic"] = reference_df["deduplicated_category"].combine_first(reference_df["Subtopic_old"])
247
  reference_df["Sentiment"] = reference_df["Sentiment"].combine_first(reference_df["Sentiment_old"])
248
 
249
-
250
- reference_df.drop(['old_category', 'deduplicated_category', "Subtopic_old", "Sentiment_old"], axis=1, inplace=True, errors="ignore")
251
-
252
- reference_df = reference_df[["Response References", "General Topic", "Subtopic", "Sentiment", "Summary", "Start row of group"]]
253
-
254
- #reference_df["General Topic"] = reference_df["General Topic"].str.lower().str.capitalize()
255
- #reference_df["Subtopic"] = reference_df["Subtopic"].str.lower().str.capitalize()
256
- #reference_df["Sentiment"] = reference_df["Sentiment"].str.lower().str.capitalize()
257
 
258
  if merge_general_topics == "Yes":
259
  # Replace General topic names for each Subtopic with that for the Subtopic with the most responses
@@ -285,8 +278,10 @@ def deduplicate_topics(reference_df:pd.DataFrame,
285
  # Clean up the DataFrame by dropping the UniqueCount column
286
  reference_df.drop(columns=['UniqueCount'], inplace=True)
287
 
288
- reference_df = reference_df[["Response References", "General Topic", "Subtopic", "Sentiment", "Summary", "Start row of group"]]
289
-
 
 
290
  # Update reference summary column with all summaries
291
  reference_df["Summary"] = reference_df.groupby(
292
  ["Response References", "General Topic", "Subtopic", "Sentiment"]
@@ -301,19 +296,17 @@ def deduplicate_topics(reference_df:pd.DataFrame,
301
  # Drop duplicates in the reference table - each comment should only have the same topic referred to once
302
  reference_df.drop_duplicates(['Response References', 'General Topic', 'Subtopic', 'Sentiment'], inplace=True)
303
 
304
-
305
  # Remake topic_summary_df based on new reference_df
306
  topic_summary_df = create_topic_summary_df_from_reference_table(reference_df)
307
 
308
  # Then merge the topic numbers back to the original dataframe
309
  reference_df = reference_df.merge(
310
- topic_summary_df[['General Topic', 'Subtopic', 'Sentiment', 'Topic_number']],
311
- on=['General Topic', 'Subtopic', 'Sentiment'],
312
  how='left'
313
  )
314
 
315
- else:
316
- print("Topics have not beeen deduplicated")
317
 
318
 
319
  if not file_data.empty:
@@ -337,13 +330,11 @@ def deduplicate_topics(reference_df:pd.DataFrame,
337
 
338
  # Outputs for markdown table output
339
  topic_summary_df_revised_display = topic_summary_df.apply(lambda col: col.map(lambda x: wrap_text(x, max_text_length=500)))
340
-
341
  deduplicated_unique_table_markdown = topic_summary_df_revised_display.to_markdown(index=False)
342
 
343
  return reference_df, topic_summary_df, output_files, log_output_files, deduplicated_unique_table_markdown
344
 
345
  def sample_reference_table_summaries(reference_df:pd.DataFrame,
346
- topic_summary_df:pd.DataFrame,
347
  random_seed:int,
348
  no_of_sampled_summaries:int=150):
349
 
@@ -354,7 +345,10 @@ def sample_reference_table_summaries(reference_df:pd.DataFrame,
354
  all_summaries = pd.DataFrame()
355
  output_files = []
356
 
357
- reference_df_grouped = reference_df.groupby(["General Topic", "Subtopic", "Sentiment"])
 
 
 
358
 
359
  if 'Revised summary' in reference_df.columns:
360
  out_message = "Summary has already been created for this file"
@@ -389,7 +383,7 @@ def sample_reference_table_summaries(reference_df:pd.DataFrame,
389
 
390
  summarised_references_markdown = summarised_references.to_markdown(index=False)
391
 
392
- return summarised_references, summarised_references_markdown, reference_df, topic_summary_df
393
 
394
  def summarise_output_topics_query(model_choice:str, in_api_key:str, temperature:float, formatted_summary_prompt:str, summarise_topic_descriptions_system_prompt:str, local_model=[]):
395
  conversation_history = []
@@ -453,7 +447,7 @@ def summarise_output_topics(summarised_references:pd.DataFrame,
453
  out_metadata = []
454
  local_model = []
455
  summarised_output_markdown = ""
456
- output_files = []
457
 
458
  # Check for data for summarisations
459
  if not topic_summary_df.empty and not reference_table_df.empty:
@@ -475,11 +469,12 @@ def summarise_output_topics(summarised_references:pd.DataFrame,
475
  out_message = "No file data found, pivot table output will not be created."
476
  print(out_message)
477
  raise Exception(out_message)
 
 
 
478
 
479
- try:
480
- all_summaries = summarised_references["Summary"].tolist()
481
- except:
482
- all_summaries = summarised_references["Revised summary"].tolist()
483
 
484
  length_all_summaries = len(all_summaries)
485
 
@@ -488,16 +483,14 @@ def summarise_output_topics(summarised_references:pd.DataFrame,
488
  print("All summaries completed. Creating outputs.")
489
 
490
  model_choice_clean = model_name_map[model_choice]
491
- file_name = re.search(r'(.*?)(?:_batch_|_col_)', table_file_name).group(1) if re.search(r'(.*?)(?:_batch_|_col_)', table_file_name) else table_file_name
492
  latest_batch_completed = int(re.search(r'batch_(\d+)_', table_file_name).group(1)) if 'batch_' in table_file_name else ""
493
  batch_size_number = int(re.search(r'size_(\d+)_', table_file_name).group(1)) if 'size_' in table_file_name else ""
494
  in_column_cleaned = re.search(r'col_(.*?)_reference', table_file_name).group(1) if 'col_' in table_file_name else ""
495
 
496
  # Save outputs for each batch. If master file created, label file as master
497
- if latest_batch_completed:
498
- batch_file_path_details = f"{file_name}_batch_{latest_batch_completed}_size_{batch_size_number}_col_{in_column_cleaned}"
499
- else:
500
- batch_file_path_details = f"{file_name}_col_{in_column_cleaned}"
501
 
502
  summarised_references["Revised summary"] = summarised_outputs
503
 
@@ -511,7 +504,7 @@ def summarise_output_topics(summarised_references:pd.DataFrame,
511
  # If no new summary is available, keep the original
512
  topic_summary_df_revised["Revised summary"] = topic_summary_df_revised["Revised summary"].combine_first(topic_summary_df_revised["Summary"])
513
 
514
- topic_summary_df_revised = topic_summary_df_revised[["General Topic", "Subtopic", "Sentiment", "Number of responses", "Revised summary"]]
515
 
516
  # Replace all instances of 'Rows X to Y:' that remain on some topics that have not had additional summaries
517
  topic_summary_df_revised["Revised summary"] = topic_summary_df_revised["Revised summary"].str.replace("^Rows\s+\d+\s+to\s+\d+:\s*", "", regex=True)
@@ -545,7 +538,6 @@ def summarise_output_topics(summarised_references:pd.DataFrame,
545
 
546
  ###
547
  topic_summary_df_revised_display = topic_summary_df_revised.apply(lambda col: col.map(lambda x: wrap_text(x, max_text_length=500)))
548
-
549
  summarised_output_markdown = topic_summary_df_revised_display.to_markdown(index=False)
550
 
551
  # Ensure same file name not returned twice
@@ -560,16 +552,14 @@ def summarise_output_topics(summarised_references:pd.DataFrame,
560
  #print("Last summary number:", length_all_summaries)
561
 
562
  if (model_choice == CHOSEN_LOCAL_MODEL_TYPE) & (RUN_LOCAL_MODEL == "1"):
563
- progress(0.1, f"Loading in local model: {CHOSEN_LOCAL_MODEL_TYPE}")
564
- local_model, tokenizer = load_model(local_model_type=CHOSEN_LOCAL_MODEL_TYPE, repo_id=LOCAL_REPO_ID, model_filename=LOCAL_MODEL_FILE, model_dir=LOCAL_MODEL_FOLDER)
565
- #print("Local model loaded:", local_model)
566
 
567
  summary_loop_description = "Creating summaries. " + str(latest_summary_completed) + " summaries completed so far."
568
  summary_loop = tqdm(range(latest_summary_completed, length_all_summaries), desc="Creating summaries", unit="summaries")
569
 
570
  if do_summaries == "Yes":
571
  for summary_no in summary_loop:
572
-
573
  print("Current summary number is:", summary_no)
574
 
575
  summary_text = all_summaries[summary_no]
@@ -631,8 +621,9 @@ def overall_summary(topic_summary_df:pd.DataFrame,
631
  latest_summary_completed = 0
632
  output_files = []
633
 
634
- model_choice_clean = model_name_map[model_choice]
635
- file_name = re.search(r'(.*?)(?:_batch_|_col_)', table_file_name).group(1) if re.search(r'(.*?)(?:_batch_|_col_)', table_file_name) else table_file_name
 
636
  latest_batch_completed = int(re.search(r'batch_(\d+)_', table_file_name).group(1)) if 'batch_' in table_file_name else ""
637
  batch_size_number = int(re.search(r'size_(\d+)_', table_file_name).group(1)) if 'size_' in table_file_name else ""
638
  in_column_cleaned = re.search(r'col_(.*?)_reference', table_file_name).group(1) if 'col_' in table_file_name else ""
@@ -659,14 +650,10 @@ def overall_summary(topic_summary_df:pd.DataFrame,
659
  if do_summaries == "Yes":
660
  for summary_no in summary_loop:
661
 
662
- print("Current summary number is:", summary_no)
663
-
664
  summary_text = topic_summary_df.to_markdown(index=False)
665
- #print("summary_text:", summary_text)
666
  formatted_summary_prompt = [summarise_everything_prompt.format(topic_summary_table=summary_text, summary_format=comprehensive_summary_format_prompt)]
667
 
668
- #print("formatted_summary_prompt:", formatted_summary_prompt)
669
-
670
  try:
671
  response, conversation_history, metadata = summarise_output_topics_query(model_choice, in_api_key, temperature, formatted_summary_prompt, summarise_topic_descriptions_system_prompt, local_model)
672
  summarised_output = response
@@ -687,16 +674,17 @@ def overall_summary(topic_summary_df:pd.DataFrame,
687
  toc = time.perf_counter()
688
  time_taken = toc - tic
689
 
690
- # Define the output file path for the formatted prompt
691
- formatted_prompt_output_path = output_folder + batch_file_path_details + "_overall_summary_" + model_choice_clean + ".txt"
 
692
 
693
  # Write the formatted prompt to the specified file
694
  try:
695
- with open(formatted_prompt_output_path, "w", encoding='utf-8', errors='replace') as f:
696
  f.write(summarised_output)
697
- output_files.append(formatted_prompt_output_path)
698
  except Exception as e:
699
- print(f"Error writing prompt to file {formatted_prompt_output_path}: {e}")
700
 
701
  output_files = list(set(output_files))
702
 
 
9
 
10
  from tools.prompts import summarise_topic_descriptions_prompt, summarise_topic_descriptions_system_prompt, system_prompt, summarise_everything_prompt, comprehensive_summary_format_prompt
11
  from tools.llm_funcs import construct_gemini_generative_model, process_requests, ResponseObject, load_model
12
+ from tools.helper_functions import create_topic_summary_df_from_reference_table, load_in_data_file, get_basic_response_data, convert_reference_table_to_pivot_table, wrap_text, clean_column_name
13
  from tools.config import OUTPUT_FOLDER, RUN_LOCAL_MODEL, MAX_COMMENT_CHARS, MAX_TOKENS, TIMEOUT_WAIT, NUMBER_OF_RETRY_ATTEMPTS, MAX_TIME_FOR_LOOP, BATCH_SIZE_DEFAULT, DEDUPLICATION_THRESHOLD, model_name_map, CHOSEN_LOCAL_MODEL_TYPE, LOCAL_REPO_ID, LOCAL_MODEL_FILE, LOCAL_MODEL_FOLDER, LLM_SEED
14
 
15
  max_tokens = MAX_TOKENS
 
158
  print(out_message)
159
  #raise Exception(out_message)
160
 
 
 
161
  # Run through this x times to try to get all duplicate topics
162
  if deduplicate_topics == "Yes":
163
+ if "Group" not in reference_df.columns:
164
+ reference_df["Group"] = "All"
165
  for i in range(0, 8):
166
  if merge_sentiment == "No":
167
  if merge_general_topics == "No":
168
  reference_df["old_category"] = reference_df["Subtopic"] + " | " + reference_df["Sentiment"]
169
  reference_df_unique = reference_df.drop_duplicates("old_category")
170
 
171
+ deduplicated_topic_map_df = reference_df_unique.groupby(["General Topic", "Sentiment", "Group"]).apply(
172
  lambda group: deduplicate_categories(
173
  group["Subtopic"],
174
  group["Sentiment"],
 
233
  # Remove rows where 'deduplicated_category' is blank or NaN
234
  deduplicated_topic_map_df = deduplicated_topic_map_df.loc[(deduplicated_topic_map_df['deduplicated_category'].str.strip() != '') & ~(deduplicated_topic_map_df['deduplicated_category'].isnull()), ['old_category','deduplicated_category', 'match_score']]
235
 
 
 
236
  reference_df = reference_df.merge(deduplicated_topic_map_df, on="old_category", how="left")
237
 
238
  reference_df.rename(columns={"Subtopic": "Subtopic_old", "Sentiment": "Sentiment_old"}, inplace=True)
 
244
  reference_df["Subtopic"] = reference_df["deduplicated_category"].combine_first(reference_df["Subtopic_old"])
245
  reference_df["Sentiment"] = reference_df["Sentiment"].combine_first(reference_df["Sentiment_old"])
246
 
247
+ #reference_df.drop(['old_category', 'deduplicated_category', "Subtopic_old", "Sentiment_old"], axis=1, inplace=True, errors="ignore")
248
+ #print("reference_df:", reference_df)
249
+ reference_df = reference_df[["Response References", "General Topic", "Subtopic", "Sentiment", "Summary", "Start row of group", "Group"]]
 
 
 
 
 
250
 
251
  if merge_general_topics == "Yes":
252
  # Replace General topic names for each Subtopic with that for the Subtopic with the most responses
 
278
  # Clean up the DataFrame by dropping the UniqueCount column
279
  reference_df.drop(columns=['UniqueCount'], inplace=True)
280
 
281
+ #print("reference_df:", reference_df)
282
+ reference_df = reference_df[["Response References", "General Topic", "Subtopic", "Sentiment", "Summary", "Start row of group", "Group"]]
283
+ #reference_df.drop(['old_category', 'deduplicated_category', "Subtopic_old", "Sentiment_old"], axis=1, inplace=True, errors="ignore")
284
+
285
  # Update reference summary column with all summaries
286
  reference_df["Summary"] = reference_df.groupby(
287
  ["Response References", "General Topic", "Subtopic", "Sentiment"]
 
296
  # Drop duplicates in the reference table - each comment should only have the same topic referred to once
297
  reference_df.drop_duplicates(['Response References', 'General Topic', 'Subtopic', 'Sentiment'], inplace=True)
298
 
 
299
  # Remake topic_summary_df based on new reference_df
300
  topic_summary_df = create_topic_summary_df_from_reference_table(reference_df)
301
 
302
  # Then merge the topic numbers back to the original dataframe
303
  reference_df = reference_df.merge(
304
+ topic_summary_df[['General Topic', 'Subtopic', 'Sentiment', 'Group', 'Topic_number']],
305
+ on=['General Topic', 'Subtopic', 'Sentiment', 'Group'],
306
  how='left'
307
  )
308
 
309
+ else: print("Topics have not been deduplicated")
 
310
 
311
 
312
  if not file_data.empty:
 
330
 
331
  # Outputs for markdown table output
332
  topic_summary_df_revised_display = topic_summary_df.apply(lambda col: col.map(lambda x: wrap_text(x, max_text_length=500)))
 
333
  deduplicated_unique_table_markdown = topic_summary_df_revised_display.to_markdown(index=False)
334
 
335
  return reference_df, topic_summary_df, output_files, log_output_files, deduplicated_unique_table_markdown
336
 
337
  def sample_reference_table_summaries(reference_df:pd.DataFrame,
 
338
  random_seed:int,
339
  no_of_sampled_summaries:int=150):
340
 
 
345
  all_summaries = pd.DataFrame()
346
  output_files = []
347
 
348
+ if "Group" not in reference_df.columns:
349
+ reference_df["Group"] = "All"
350
+
351
+ reference_df_grouped = reference_df.groupby(["General Topic", "Subtopic", "Sentiment", "Group"])
352
 
353
  if 'Revised summary' in reference_df.columns:
354
  out_message = "Summary has already been created for this file"
 
383
 
384
  summarised_references_markdown = summarised_references.to_markdown(index=False)
385
 
386
+ return summarised_references, summarised_references_markdown#, reference_df, topic_summary_df
387
 
388
  def summarise_output_topics_query(model_choice:str, in_api_key:str, temperature:float, formatted_summary_prompt:str, summarise_topic_descriptions_system_prompt:str, local_model=[]):
389
  conversation_history = []
 
447
  out_metadata = []
448
  local_model = []
449
  summarised_output_markdown = ""
450
+ output_files = []
451
 
452
  # Check for data for summarisations
453
  if not topic_summary_df.empty and not reference_table_df.empty:
 
469
  out_message = "No file data found, pivot table output will not be created."
470
  print(out_message)
471
  raise Exception(out_message)
472
+
473
+ if "Group" not in reference_table_df.columns: reference_table_df["Group"] = "All"
474
+ if "Group" not in topic_summary_df.columns: topic_summary_df["Group"] = "All"
475
 
476
+ try: all_summaries = summarised_references["Summary"].tolist()
477
+ except: all_summaries = summarised_references["Revised summary"].tolist()
 
 
478
 
479
  length_all_summaries = len(all_summaries)
480
 
 
483
  print("All summaries completed. Creating outputs.")
484
 
485
  model_choice_clean = model_name_map[model_choice]
486
+ file_name = re.search(r'(.*?)(?:_all_|_final_|_batch_|_col_)', table_file_name).group(1) if re.search(r'(.*?)(?:_all_|_final_|_batch_|_col_)', table_file_name) else table_file_name
487
  latest_batch_completed = int(re.search(r'batch_(\d+)_', table_file_name).group(1)) if 'batch_' in table_file_name else ""
488
  batch_size_number = int(re.search(r'size_(\d+)_', table_file_name).group(1)) if 'size_' in table_file_name else ""
489
  in_column_cleaned = re.search(r'col_(.*?)_reference', table_file_name).group(1) if 'col_' in table_file_name else ""
490
 
491
  # Save outputs for each batch. If master file created, label file as master
492
+ if latest_batch_completed: batch_file_path_details = f"{file_name}_batch_{latest_batch_completed}_size_{batch_size_number}_col_{in_column_cleaned}"
493
+ else: batch_file_path_details = f"{file_name}_col_{in_column_cleaned}"
 
 
494
 
495
  summarised_references["Revised summary"] = summarised_outputs
496
 
 
504
  # If no new summary is available, keep the original
505
  topic_summary_df_revised["Revised summary"] = topic_summary_df_revised["Revised summary"].combine_first(topic_summary_df_revised["Summary"])
506
 
507
+ topic_summary_df_revised = topic_summary_df_revised[["General Topic", "Subtopic", "Sentiment", "Group", "Number of responses", "Revised summary"]]
508
 
509
  # Replace all instances of 'Rows X to Y:' that remain on some topics that have not had additional summaries
510
  topic_summary_df_revised["Revised summary"] = topic_summary_df_revised["Revised summary"].str.replace("^Rows\s+\d+\s+to\s+\d+:\s*", "", regex=True)
 
538
 
539
  ###
540
  topic_summary_df_revised_display = topic_summary_df_revised.apply(lambda col: col.map(lambda x: wrap_text(x, max_text_length=500)))
 
541
  summarised_output_markdown = topic_summary_df_revised_display.to_markdown(index=False)
542
 
543
  # Ensure same file name not returned twice
 
552
  #print("Last summary number:", length_all_summaries)
553
 
554
  if (model_choice == CHOSEN_LOCAL_MODEL_TYPE) & (RUN_LOCAL_MODEL == "1"):
555
+ progress(0.1, f"Loading in local model: {CHOSEN_LOCAL_MODEL_TYPE}")
556
+ local_model, tokenizer = load_model(local_model_type=CHOSEN_LOCAL_MODEL_TYPE, repo_id=LOCAL_REPO_ID, model_filename=LOCAL_MODEL_FILE, model_dir=LOCAL_MODEL_FOLDER)
 
557
 
558
  summary_loop_description = "Creating summaries. " + str(latest_summary_completed) + " summaries completed so far."
559
  summary_loop = tqdm(range(latest_summary_completed, length_all_summaries), desc="Creating summaries", unit="summaries")
560
 
561
  if do_summaries == "Yes":
562
  for summary_no in summary_loop:
 
563
  print("Current summary number is:", summary_no)
564
 
565
  summary_text = all_summaries[summary_no]
 
621
  latest_summary_completed = 0
622
  output_files = []
623
 
624
+ model_choice_clean = model_name_map[model_choice]
625
+ model_choice_clean_short = clean_column_name(model_choice_clean, max_length=20, front_characters=False)
626
+ file_name = re.search(r'(.*?)(?:_all_|_final_|_batch_|_col_)', table_file_name).group(1) if re.search(r'(.*?)(?:_all_|_final_|_batch_|_col_)', table_file_name) else table_file_name
627
  latest_batch_completed = int(re.search(r'batch_(\d+)_', table_file_name).group(1)) if 'batch_' in table_file_name else ""
628
  batch_size_number = int(re.search(r'size_(\d+)_', table_file_name).group(1)) if 'size_' in table_file_name else ""
629
  in_column_cleaned = re.search(r'col_(.*?)_reference', table_file_name).group(1) if 'col_' in table_file_name else ""
 
650
  if do_summaries == "Yes":
651
  for summary_no in summary_loop:
652
 
 
 
653
  summary_text = topic_summary_df.to_markdown(index=False)
654
+
655
  formatted_summary_prompt = [summarise_everything_prompt.format(topic_summary_table=summary_text, summary_format=comprehensive_summary_format_prompt)]
656
 
 
 
657
  try:
658
  response, conversation_history, metadata = summarise_output_topics_query(model_choice, in_api_key, temperature, formatted_summary_prompt, summarise_topic_descriptions_system_prompt, local_model)
659
  summarised_output = response
 
674
  toc = time.perf_counter()
675
  time_taken = toc - tic
676
 
677
+ # Define the output file path for the output
678
+ print("batch_file_path_details just before save:", batch_file_path_details)
679
+ overall_summary_output_path = output_folder + batch_file_path_details + "_overall_summary_" + model_choice_clean_short + "_temp_" + str(temperature) + ".txt"
680
 
681
  # Write the formatted prompt to the specified file
682
  try:
683
+ with open(overall_summary_output_path, "w", encoding='utf-8', errors='replace') as f:
684
  f.write(summarised_output)
685
+ output_files.append(overall_summary_output_path)
686
  except Exception as e:
687
+ print(f"Error writing prompt to file {overall_summary_output_path}: {e}")
688
 
689
  output_files = list(set(output_files))
690
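tools/dedup_summaries.py now recovers the base file name and batch details from the uploaded table's file name (adding `_all_` and `_final_` to the recognised markers) before composing output paths such as the overall summary text file. A condensed sketch of that parsing, run on a hypothetical file name:

```python
import re

def parse_table_file_name(table_file_name: str) -> dict:
    # Recover the base name before any _all_/_final_/_batch_/_col_ marker,
    # as the updated summarise/overall_summary functions do.
    match = re.search(r'(.*?)(?:_all_|_final_|_batch_|_col_)', table_file_name)
    file_name = match.group(1) if match else table_file_name

    batch = re.search(r'batch_(\d+)_', table_file_name)
    size = re.search(r'size_(\d+)_', table_file_name)
    col = re.search(r'col_(.*?)_reference', table_file_name)

    return {
        "file_name": file_name,
        "latest_batch_completed": int(batch.group(1)) if batch else "",
        "batch_size_number": int(size.group(1)) if size else "",
        "in_column_cleaned": col.group(1) if col else "",
    }

# Hypothetical example file name, for illustration only:
print(parse_table_file_name("survey_batch_3_size_50_col_comments_reference_table.csv"))
# {'file_name': 'survey', 'latest_batch_completed': 3,
#  'batch_size_number': 50, 'in_column_cleaned': 'comments'}
```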
 
tools/helper_functions.py CHANGED
@@ -32,7 +32,6 @@ def empty_output_vars_extract_topics():
32
 
33
  return master_topic_df_state, master_topic_summary_df_state, master_reference_df_state, text_output_file, text_output_file_list_state, latest_batch_completed, log_files_output, log_files_output_list_state, conversation_metadata_textbox, estimated_time_taken_number, file_data_state, reference_data_file_name_textbox, display_topic_table_markdown, summary_output_file_list, summary_input_file_list, overall_summarisation_input_files, overall_summary_output_files
34
 
35
-
36
  def empty_output_vars_summarise():
37
  # Empty output objects before summarising files
38
 
@@ -47,7 +46,7 @@ def empty_output_vars_summarise():
47
 
48
  return summary_reference_table_sample_state, master_topic_summary_df_revised_summaries_state, master_reference_df_revised_summaries_state, summary_output_files, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, overall_summarisation_input_files
49
 
50
- def get_or_create_env_var(var_name, default_value):
51
  # Get the environment variable if it exists
52
  value = os.environ.get(var_name)
53
 
@@ -58,14 +57,14 @@ def get_or_create_env_var(var_name, default_value):
58
 
59
  return value
60
 
61
- def get_file_path_with_extension(file_path):
62
  # First, get the basename of the file (e.g., "example.txt" from "/path/to/example.txt")
63
  basename = os.path.basename(file_path)
64
 
65
  # Return the basename with its extension
66
  return basename
67
 
68
- def get_file_name_no_ext(file_path):
69
  # First, get the basename of the file (e.g., "example.txt" from "/path/to/example.txt")
70
  basename = os.path.basename(file_path)
71
 
@@ -76,7 +75,7 @@ def get_file_name_no_ext(file_path):
76
 
77
  return filename_without_extension
78
 
79
- def detect_file_type(filename):
80
  """Detect the file type based on its extension."""
81
  if (filename.endswith('.csv')) | (filename.endswith('.csv.gz')) | (filename.endswith('.zip')):
82
  return 'csv'
@@ -232,7 +231,6 @@ def join_cols_onto_reference_df(reference_df:pd.DataFrame, original_data_df:pd.D
232
 
233
  return out_reference_df, file_data_outputs
234
 
235
-
236
  def get_basic_response_data(file_data:pd.DataFrame, chosen_cols:List[str], verify_titles:bool=False) -> pd.DataFrame:
237
 
238
  if not isinstance(chosen_cols, list):
@@ -253,9 +251,7 @@ def get_basic_response_data(file_data:pd.DataFrame, chosen_cols:List[str], verif
253
  basic_response_data = basic_response_data[['Reference', 'Response', 'Original Reference']]
254
 
255
  basic_response_data["Response"] = basic_response_data["Response"].str.strip()
256
- basic_response_data["Response"] = basic_response_data["Response"].apply(initial_clean)
257
-
258
- print("basic_response_data:", basic_response_data)
259
 
260
  return basic_response_data
261
 
@@ -291,7 +287,10 @@ def convert_reference_table_to_pivot_table(df:pd.DataFrame, basic_response_data:
291
 
292
  def create_topic_summary_df_from_reference_table(reference_df:pd.DataFrame):
293
 
294
- out_topic_summary_df = (reference_df.groupby(["General Topic", "Subtopic", "Sentiment"])
 
 
 
295
  .agg({
296
  'Response References': 'size', # Count the number of references
297
  'Summary': lambda x: '<br>'.join(
@@ -299,12 +298,14 @@ def create_topic_summary_df_from_reference_table(reference_df:pd.DataFrame):
299
  )
300
  })
301
  .reset_index()
302
- .sort_values('Response References', ascending=False) # Sort by size, biggest first
303
  .assign(Topic_number=lambda df: np.arange(1, len(df) + 1)) # Add numbering 1 to x
304
  )
305
 
306
  out_topic_summary_df = out_topic_summary_df.rename(columns={"Response References": "Number of responses"}, errors="ignore")
307
 
 
 
308
  return out_topic_summary_df
309
 
310
  # Wrap text in each column to the specified max width, including whole words
 
32
 
33
  return master_topic_df_state, master_topic_summary_df_state, master_reference_df_state, text_output_file, text_output_file_list_state, latest_batch_completed, log_files_output, log_files_output_list_state, conversation_metadata_textbox, estimated_time_taken_number, file_data_state, reference_data_file_name_textbox, display_topic_table_markdown, summary_output_file_list, summary_input_file_list, overall_summarisation_input_files, overall_summary_output_files
34
 
 
35
  def empty_output_vars_summarise():
36
  # Empty output objects before summarising files
37
 
 
46
 
47
  return summary_reference_table_sample_state, master_topic_summary_df_revised_summaries_state, master_reference_df_revised_summaries_state, summary_output_files, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, overall_summarisation_input_files
48
 
49
+ def get_or_create_env_var(var_name:str, default_value:str):
50
  # Get the environment variable if it exists
51
  value = os.environ.get(var_name)
52
 
 
57
 
58
  return value
59
 
60
+ def get_file_path_with_extension(file_path:str):
61
  # First, get the basename of the file (e.g., "example.txt" from "/path/to/example.txt")
62
  basename = os.path.basename(file_path)
63
 
64
  # Return the basename with its extension
65
  return basename
66
 
67
+ def get_file_name_no_ext(file_path:str):
68
  # First, get the basename of the file (e.g., "example.txt" from "/path/to/example.txt")
69
  basename = os.path.basename(file_path)
70
 
 
75
 
76
  return filename_without_extension
77
 
78
+ def detect_file_type(filename:str):
79
  """Detect the file type based on its extension."""
80
  if (filename.endswith('.csv')) | (filename.endswith('.csv.gz')) | (filename.endswith('.zip')):
81
  return 'csv'
 
231
 
232
  return out_reference_df, file_data_outputs
233
 
 
234
  def get_basic_response_data(file_data:pd.DataFrame, chosen_cols:List[str], verify_titles:bool=False) -> pd.DataFrame:
235
 
236
  if not isinstance(chosen_cols, list):
 
251
  basic_response_data = basic_response_data[['Reference', 'Response', 'Original Reference']]
252
 
253
  basic_response_data["Response"] = basic_response_data["Response"].str.strip()
254
+ basic_response_data["Response"] = basic_response_data["Response"].apply(initial_clean)
 
 
255
 
256
  return basic_response_data
257
 
 
287
 
288
  def create_topic_summary_df_from_reference_table(reference_df:pd.DataFrame):
289
 
290
+ if "Group" not in reference_df.columns:
291
+ reference_df["Group"] = "All"
292
+
293
+ out_topic_summary_df = (reference_df.groupby(["General Topic", "Subtopic", "Sentiment", "Group"])
294
  .agg({
295
  'Response References': 'size', # Count the number of references
296
  'Summary': lambda x: '<br>'.join(
 
298
  )
299
  })
300
  .reset_index()
301
+ #.sort_values('Response References', ascending=False) # Sort by size, biggest first
302
  .assign(Topic_number=lambda df: np.arange(1, len(df) + 1)) # Add numbering 1 to x
303
  )
304
 
305
  out_topic_summary_df = out_topic_summary_df.rename(columns={"Response References": "Number of responses"}, errors="ignore")
306
 
307
+ out_topic_summary_df = out_topic_summary_df.sort_values(["Group", "Number of responses", "General Topic", "Subtopic", "Sentiment"], ascending=[True, False, True, True, True])
308
+
309
  return out_topic_summary_df
310
 
311
  # Wrap text in each column to the specified max width, including whole words
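In tools/helper_functions.py, `create_topic_summary_df_from_reference_table` now aggregates over the group-aware key and sorts within each group by response count rather than globally. A sketch of the reconstructed aggregation; the non-blank filter inside the `Summary` join is an assumption, since the diff truncates that lambda:

```python
import numpy as np
import pandas as pd

def build_topic_summary(reference_df: pd.DataFrame) -> pd.DataFrame:
    # Group-aware rebuild of the unique-topics table: count responses,
    # join the summaries, number the topics, then sort within each group.
    if "Group" not in reference_df.columns:
        reference_df["Group"] = "All"

    summary = (reference_df
               .groupby(["General Topic", "Subtopic", "Sentiment", "Group"])
               .agg({"Response References": "size",
                     # Assumed join condition: keep non-blank string summaries.
                     "Summary": lambda x: "<br>".join(
                         s for s in x if isinstance(s, str) and s.strip())})
               .reset_index()
               .assign(Topic_number=lambda df: np.arange(1, len(df) + 1))
               .rename(columns={"Response References": "Number of responses"}))

    return summary.sort_values(
        ["Group", "Number of responses", "General Topic", "Subtopic", "Sentiment"],
        ascending=[True, False, True, True, True])
```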
tools/llm_api_call.py CHANGED
@@ -421,7 +421,6 @@ def write_llm_output_and_logs(responses: List[ResponseObject],
421
  reference_table_out_path = "reference_table_error.csv"
422
  topic_summary_df_out_path = "unique_topic_table_error.csv"
423
  topic_with_response_df = pd.DataFrame()
424
- markdown_table = ""
425
  out_reference_df = pd.DataFrame()
426
  out_topic_summary_df = pd.DataFrame()
427
  batch_file_path_details = "error"
@@ -461,7 +460,7 @@ def write_llm_output_and_logs(responses: List[ResponseObject],
461
  topic_with_response_df, is_error = convert_response_text_to_dataframe(response_text)
462
  except Exception as e:
463
  print("Error in parsing markdown table from response text:", e)
464
- return topic_table_out_path, reference_table_out_path, topic_summary_df_out_path, topic_with_response_df, markdown_table, out_reference_df, out_topic_summary_df, batch_file_path_details, is_error
465
 
466
  # Rename columns to ensure consistent use of data frames later in code
467
  new_column_names = {
@@ -607,9 +606,11 @@ def write_llm_output_and_logs(responses: List[ResponseObject],
607
 
608
  out_topic_summary_df = out_topic_summary_df.rename(columns={"Response References":"Number of responses"}, errors="ignore")
609
 
 
 
610
  topic_summary_df_out_path = output_folder + batch_file_path_details + "_unique_topics_" + model_choice_clean + "_temp_" + str(temperature) + ".csv"
611
 
612
- return topic_table_out_path, reference_table_out_path, topic_summary_df_out_path, topic_with_response_df, markdown_table, out_reference_df, out_topic_summary_df, batch_file_path_details, is_error
613
 
614
  def generate_zero_shot_topics_df(zero_shot_topics:pd.DataFrame,
615
  force_zero_shot_radio:str="No",
@@ -988,7 +989,7 @@ def extract_topics(in_data_file,
988
  full_prompt = formatted_system_prompt + formatted_summary_prompt
989
 
990
  # Define the output file path for the formatted prompt
991
- formatted_prompt_output_path = output_folder + clean_column_name(file_name, max_length=30, front_characters=False) + "_" + str(reported_batch_no) + "_full_prompt_" + clean_column_name(model_choice_clean, max_length = 20, front_characters=False) + "_temp_" + str(temperature) + ".txt"
992
 
993
  # Write the formatted prompt to the specified file
994
  try:
@@ -1009,7 +1010,7 @@ def extract_topics(in_data_file,
1009
  responses, conversation_history, whole_conversation, whole_conversation_metadata, response_text = call_llm_with_markdown_table_checks(summary_prompt_list, system_prompt, conversation_history, whole_conversation, whole_conversation_metadata, google_client, google_config, model_choice, temperature, reported_batch_no, local_model, MAX_OUTPUT_VALIDATION_ATTEMPTS, master = True)
1010
 
1011
  # Return output tables
1012
- topic_table_out_path, reference_table_out_path, topic_summary_df_out_path, new_topic_df, new_markdown_table, new_reference_df, new_topic_summary_df, master_batch_out_file_part, is_error = write_llm_output_and_logs(responses, whole_conversation, whole_conversation_metadata, file_name, latest_batch_completed, start_row, end_row, model_choice_clean, temperature, log_files_output_paths, existing_reference_df, existing_topic_summary_df, batch_size, chosen_cols, batch_basic_response_df, group_name, first_run=False, output_folder=output_folder)
1013
 
1014
  # Write final output to text file for logging purposes
1015
  try:
@@ -1046,6 +1047,8 @@ def extract_topics(in_data_file,
1046
  ## Unique topic list
1047
  new_topic_summary_df = pd.concat([new_topic_summary_df, existing_topic_summary_df]).drop_duplicates('Subtopic')
1048
 
 
 
1049
  new_topic_summary_df.to_csv(topic_summary_df_out_path, index=None)
1050
  out_file_paths.append(topic_summary_df_out_path)
1051
 
@@ -1101,7 +1104,7 @@ def extract_topics(in_data_file,
1101
  responses, conversation_history, whole_conversation, whole_conversation_metadata, response_text = call_llm_with_markdown_table_checks(batch_prompts, system_prompt, conversation_history, whole_conversation, whole_conversation_metadata, google_client, google_config, model_choice, temperature, reported_batch_no, local_model, MAX_OUTPUT_VALIDATION_ATTEMPTS)
1102
 
1103
 
1104
- topic_table_out_path, reference_table_out_path, topic_summary_df_out_path, topic_table_df, markdown_table, reference_df, new_topic_summary_df, batch_file_path_details, is_error = write_llm_output_and_logs(responses, whole_conversation, whole_conversation_metadata, file_name, latest_batch_completed, start_row, end_row, model_choice_clean, temperature, log_files_output_paths, existing_reference_df, existing_topic_summary_df, batch_size, chosen_cols, batch_basic_response_df, group_name, first_run=True, output_folder=output_folder)
1105
 
1106
  # If error in table parsing, leave function
1107
  if is_error == True:
@@ -1121,6 +1124,8 @@ def extract_topics(in_data_file,
1121
 
1122
  new_topic_summary_df = pd.concat([new_topic_summary_df, existing_topic_summary_df]).drop_duplicates('Subtopic')
1123
 
 
 
1124
  new_topic_summary_df.to_csv(topic_summary_df_out_path, index=None)
1125
  out_file_paths.append(topic_summary_df_out_path)
1126
 
@@ -1131,20 +1136,23 @@ def extract_topics(in_data_file,
1131
 
1132
  # Write final output to text file also
1133
  try:
1134
- final_table_output_path = output_folder + batch_file_path_details + "_full_final_response_" + model_choice_clean + "_temp_" + str(temperature) + ".txt"
1135
-
1136
- if isinstance(responses[-1], ResponseObject):
1137
- with open(final_table_output_path, "w", encoding='utf-8', errors='replace') as f:
1138
- f.write(responses[-1].text)
1139
- unique_table_df_display_table_markdown = responses[-1].text
1140
- elif "choices" in responses[-1]:
1141
- with open(final_table_output_path, "w", encoding='utf-8', errors='replace') as f:
1142
- f.write(responses[-1]["choices"][0]['text'])
1143
- unique_table_df_display_table_markdown =responses[-1]["choices"][0]['text']
1144
- else:
1145
- with open(final_table_output_path, "w", encoding='utf-8', errors='replace') as f:
1146
- f.write(responses[-1].text)
1147
- unique_table_df_display_table_markdown = responses[-1].text
1148
 
1149
  log_files_output_paths.append(final_table_output_path)
1150
 
@@ -1203,7 +1211,7 @@ def extract_topics(in_data_file,
1203
 
1204
  print("All summaries completed. Creating outputs.")
1205
 
1206
- model_choice_clean = clean_column_name(model_name_map[model_choice], max_length=20, front_characters=False)
1207
  # Example usage
1208
  in_column_cleaned = clean_column_name(chosen_cols, max_length=20)
1209
 
@@ -1214,14 +1222,13 @@ def extract_topics(in_data_file,
1214
  file_path_details = f"{file_name_cleaned}_col_{in_column_cleaned}"
1215
 
1216
  # Create a pivoted reference table
1217
- existing_reference_df_pivot = convert_reference_table_to_pivot_table(existing_reference_df)
1218
 
1219
  # Save the new DataFrame to CSV
1220
- #topic_table_out_path = output_folder + batch_file_path_details + "_topic_table_" + model_choice_clean + "_temp_" + str(temperature) + ".csv"
1221
- reference_table_out_pivot_path = output_folder + file_path_details + "_final_reference_table_pivot_" + model_choice_clean + "_temp_" + str(temperature) + ".csv"
1222
- reference_table_out_path = output_folder + file_path_details + "_final_reference_table_" + model_choice_clean + "_temp_" + str(temperature) + ".csv"
1223
- topic_summary_df_out_path = output_folder + file_path_details + "_final_unique_topics_" + model_choice_clean + "_temp_" + str(temperature) + ".csv"
1224
- basic_response_data_out_path = output_folder + file_path_details + "_simplified_data_file_" + model_choice_clean + "_temp_" + str(temperature) + ".csv"
1225
 
1226
  ## Reference table mapping response numbers to topics
1227
  existing_reference_df.to_csv(reference_table_out_path, index=None)
@@ -1230,30 +1237,33 @@ def extract_topics(in_data_file,
1230
 
1231
  # Create final unique topics table from reference table to ensure consistent numbers
1232
  final_out_topic_summary_df = create_topic_summary_df_from_reference_table(existing_reference_df)
1233
 
1234
  ## Unique topic list
1235
  final_out_topic_summary_df.to_csv(topic_summary_df_out_path, index=None, encoding='utf-8')
1236
  out_file_paths.append(topic_summary_df_out_path)
1237
 
1238
  # Ensure that we are only returning the final results to outputs
1239
  out_file_paths = [x for x in out_file_paths if '_final_' in x]
1240
 
1241
  ## Reference table mapping response numbers to topics
1242
  existing_reference_df_pivot.to_csv(reference_table_out_pivot_path, index = None, encoding='utf-8')
1243
  log_files_output_paths.append(reference_table_out_pivot_path)
1244
 
1245
  ## Create a dataframe for missing response references:
1246
  # Assuming existing_reference_df and file_data are already defined
1247
- # Simplify table to just responses column and the Response reference number
1248
-
1249
  basic_response_data = get_basic_response_data(file_data, chosen_cols)
1250
 
1251
-
1252
  # Save simplified file data to log outputs
1253
  pd.DataFrame(basic_response_data).to_csv(basic_response_data_out_path, index=None, encoding='utf-8')
1254
  log_files_output_paths.append(basic_response_data_out_path)
1255
 
1256
-
1257
  # Step 1: Identify missing references
1258
  missing_references = basic_response_data[~basic_response_data['Reference'].astype(str).isin(existing_reference_df['Response References'].astype(str).unique())]
1259
 
@@ -1267,7 +1277,7 @@ def extract_topics(in_data_file,
1267
  # Display the new DataFrame
1268
  #print("missing_df:", missing_df)
1269
 
1270
- missing_df_out_path = output_folder + file_path_details + "_missing_references_" + model_choice_clean + "_temp_" + str(temperature) + ".csv"
1271
  missing_df.to_csv(missing_df_out_path, index=None, encoding='utf-8')
1272
  log_files_output_paths.append(missing_df_out_path)
1273
 
@@ -1281,10 +1291,10 @@ def extract_topics(in_data_file,
1281
 
1282
  print("latest_batch_completed at end of batch iterations to return is", latest_batch_completed)
1283
 
1284
- return unique_table_df_display_table_markdown, existing_topics_table, final_out_topic_summary_df, existing_reference_df, final_out_file_paths, final_out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, final_out_file_paths, final_out_file_paths, modifiable_topic_summary_df, final_out_file_paths, join_file_paths # gr.Dataframe(value=modifiable_topic_summary_df, headers=None, col_count=(modifiable_topic_summary_df.shape[1], "fixed"), row_count = (modifiable_topic_summary_df.shape[0], "fixed"), visible=True, type="pandas"),
1285
 
1286
 
1287
- return unique_table_df_display_table_markdown, existing_topics_table, existing_topic_summary_df, existing_reference_df, out_file_paths, out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, out_file_paths, out_file_paths, modifiable_topic_summary_df, out_file_paths, join_file_paths # gr.Dataframe(value=modifiable_topic_summary_df, headers=None, col_count=(modifiable_topic_summary_df.shape[1], "fixed"), row_count = (modifiable_topic_summary_df.shape[0], "fixed"), visible=True, type="pandas"),
1288
 
1289
  def wrapper_extract_topics_per_column_value(
1290
  selected_col: str,
@@ -1350,6 +1360,7 @@ def wrapper_extract_topics_per_column_value(
1350
  acc_topics_table = initial_existing_topics_table.copy()
1351
  acc_reference_df = initial_existing_reference_df.copy()
1352
  acc_topic_summary_df = initial_existing_topic_summary_df.copy()
 
1353
 
1354
  # Lists are extended
1355
  acc_out_file_paths = []
@@ -1365,7 +1376,7 @@ def wrapper_extract_topics_per_column_value(
1365
 
1366
  wrapper_first_loop = initial_first_loop_state
1367
 
1368
- for i, group_value in enumerate(unique_values):
1369
  print(f"\nProcessing segment: {selected_col} = {group_value} ({i+1}/{len(unique_values)})")
1370
 
1371
  filtered_file_data = file_data.copy()
@@ -1412,6 +1423,7 @@ def wrapper_extract_topics_per_column_value(
1412
  seg_gradio_df,
1413
  _seg_out_files5, # Often same as 1
1414
  seg_join_files,
 
1415
  ) = extract_topics(
1416
  in_data_file=in_data_file,
1417
  file_data=filtered_file_data,
@@ -1460,9 +1472,9 @@ def wrapper_extract_topics_per_column_value(
1460
  # Aggregate results
1461
  # The DFs returned by extract_topics are already cumulative for *its own run*.
1462
  # We now make them cumulative for the *wrapper's run*.
1463
- acc_topics_table = seg_topics_table
1464
- acc_reference_df = seg_reference_df
1465
- acc_topic_summary_df = seg_topic_summary_df
1466
 
1467
  # For lists, extend. Use set to remove duplicates if paths might be re-added.
1468
  acc_out_file_paths.extend(f for f in seg_out_files1 if f not in acc_out_file_paths)
@@ -1484,6 +1496,32 @@ def wrapper_extract_topics_per_column_value(
1484
  # Optionally, decide if you want to continue with other segments or stop
1485
  # For now, it will continue
1486
  continue
1487
 
1488
  print(f"\nWrapper finished processing all segments. Total time: {acc_total_time_taken:.2f}s")
1489
 
 
421
  reference_table_out_path = "reference_table_error.csv"
422
  topic_summary_df_out_path = "unique_topic_table_error.csv"
423
  topic_with_response_df = pd.DataFrame()
 
424
  out_reference_df = pd.DataFrame()
425
  out_topic_summary_df = pd.DataFrame()
426
  batch_file_path_details = "error"
 
460
  topic_with_response_df, is_error = convert_response_text_to_dataframe(response_text)
461
  except Exception as e:
462
  print("Error in parsing markdown table from response text:", e)
463
+ return topic_table_out_path, reference_table_out_path, topic_summary_df_out_path, topic_with_response_df, out_reference_df, out_topic_summary_df, batch_file_path_details, is_error
464
 
465
  # Rename columns to ensure consistent use of data frames later in code
466
  new_column_names = {
 
606
 
607
  out_topic_summary_df = out_topic_summary_df.rename(columns={"Response References":"Number of responses"}, errors="ignore")
608
 
609
+ out_topic_summary_df["Group"] = group_name
610
+
611
  topic_summary_df_out_path = output_folder + batch_file_path_details + "_unique_topics_" + model_choice_clean + "_temp_" + str(temperature) + ".csv"
612
 
613
+ return topic_table_out_path, reference_table_out_path, topic_summary_df_out_path, topic_with_response_df, out_reference_df, out_topic_summary_df, batch_file_path_details, is_error
614
 
615
  def generate_zero_shot_topics_df(zero_shot_topics:pd.DataFrame,
616
  force_zero_shot_radio:str="No",
 
989
  full_prompt = formatted_system_prompt + formatted_summary_prompt
990
 
991
  # Define the output file path for the formatted prompt
992
+ formatted_prompt_output_path = output_folder + batch_file_path_details + "_full_prompt_" + clean_column_name(model_choice_clean, max_length = 20, front_characters=False) + "_temp_" + str(temperature) + ".txt"
993
 
994
  # Write the formatted prompt to the specified file
995
  try:
 
1010
  responses, conversation_history, whole_conversation, whole_conversation_metadata, response_text = call_llm_with_markdown_table_checks(summary_prompt_list, system_prompt, conversation_history, whole_conversation, whole_conversation_metadata, google_client, google_config, model_choice, temperature, reported_batch_no, local_model, MAX_OUTPUT_VALIDATION_ATTEMPTS, master = True)
1011
 
1012
  # Return output tables
1013
+ topic_table_out_path, reference_table_out_path, topic_summary_df_out_path, new_topic_df, new_reference_df, new_topic_summary_df, master_batch_out_file_part, is_error = write_llm_output_and_logs(responses, whole_conversation, whole_conversation_metadata, file_name, latest_batch_completed, start_row, end_row, model_choice_clean, temperature, log_files_output_paths, existing_reference_df, existing_topic_summary_df, batch_size, chosen_cols, batch_basic_response_df, group_name, first_run=False, output_folder=output_folder)
1014
 
1015
  # Write final output to text file for logging purposes
1016
  try:
 
1047
  ## Unique topic list
1048
  new_topic_summary_df = pd.concat([new_topic_summary_df, existing_topic_summary_df]).drop_duplicates('Subtopic')
1049
 
1050
+ new_topic_summary_df["Group"] = group_name
1051
+
1052
  new_topic_summary_df.to_csv(topic_summary_df_out_path, index=None)
1053
  out_file_paths.append(topic_summary_df_out_path)
1054
 
 
1104
  responses, conversation_history, whole_conversation, whole_conversation_metadata, response_text = call_llm_with_markdown_table_checks(batch_prompts, system_prompt, conversation_history, whole_conversation, whole_conversation_metadata, google_client, google_config, model_choice, temperature, reported_batch_no, local_model, MAX_OUTPUT_VALIDATION_ATTEMPTS)
1105
 
1106
 
1107
+ topic_table_out_path, reference_table_out_path, topic_summary_df_out_path, topic_table_df, reference_df, new_topic_summary_df, batch_file_path_details, is_error = write_llm_output_and_logs(responses, whole_conversation, whole_conversation_metadata, file_name, latest_batch_completed, start_row, end_row, model_choice_clean, temperature, log_files_output_paths, existing_reference_df, existing_topic_summary_df, batch_size, chosen_cols, batch_basic_response_df, group_name, first_run=True, output_folder=output_folder)
1108
 
1109
  # If error in table parsing, leave function
1110
  if is_error == True:
 
1124
 
1125
  new_topic_summary_df = pd.concat([new_topic_summary_df, existing_topic_summary_df]).drop_duplicates('Subtopic')
1126
 
1127
+ new_topic_summary_df["Group"] = group_name
1128
+
1129
  new_topic_summary_df.to_csv(topic_summary_df_out_path, index=None)
1130
  out_file_paths.append(topic_summary_df_out_path)
1131
 
 
1136
 
1137
  # Write final output to text file also
1138
  try:
1139
+ final_table_output_path = output_folder + batch_file_path_details + "_full_final_response_" + clean_column_name(model_choice_clean, max_length = 20, front_characters=False) + "_temp_" + str(temperature) + ".txt"
1140
+
1141
+ # if isinstance(responses[-1], ResponseObject):
1142
+ # with open(final_table_output_path, "w", encoding='utf-8', errors='replace') as f:
1143
+ # f.write(responses[-1].text)
1144
+ # unique_table_df_display_table_markdown = responses[-1].text
1145
+ # elif "choices" in responses[-1]:
1146
+ # with open(final_table_output_path, "w", encoding='utf-8', errors='replace') as f:
1147
+ # f.write(responses[-1]["choices"][0]['text'])
1148
+ # unique_table_df_display_table_markdown =responses[-1]["choices"][0]['text']
1149
+ # else:
1150
+ # with open(final_table_output_path, "w", encoding='utf-8', errors='replace') as f:
1151
+ # f.write(responses[-1].text)
1152
+ # unique_table_df_display_table_markdown = responses[-1].text
1153
+
1154
+ unique_table_df_display_table = new_topic_summary_df.apply(lambda col: col.map(lambda x: wrap_text(x, max_text_length=500)))
1155
+ unique_table_df_display_table_markdown = unique_table_df_display_table[["General Topic", "Subtopic", "Sentiment", "Number of responses", "Summary"]].to_markdown(index=False)
1156
 
1157
  log_files_output_paths.append(final_table_output_path)
1158
 
 
1211
 
1212
  print("All summaries completed. Creating outputs.")
1213
 
1214
+ model_choice_clean_short = clean_column_name(model_choice_clean, max_length=20, front_characters=False)
1215
  # Example usage
1216
  in_column_cleaned = clean_column_name(chosen_cols, max_length=20)
1217
 
 
1222
  file_path_details = f"{file_name_cleaned}_col_{in_column_cleaned}"
1223
 
1224
  # Create a pivoted reference table
1225
+ existing_reference_df_pivot = convert_reference_table_to_pivot_table(existing_reference_df)
1226
 
1227
  # Save the new DataFrame to CSV
1228
+ reference_table_out_pivot_path = output_folder + file_path_details + "_final_reference_table_pivot_" + model_choice_clean_short + "_temp_" + str(temperature) + ".csv"
1229
+ reference_table_out_path = output_folder + file_path_details + "_final_reference_table_" + model_choice_clean_short + "_temp_" + str(temperature) + ".csv"
1230
+ topic_summary_df_out_path = output_folder + file_path_details + "_final_unique_topics_" + model_choice_clean_short + "_temp_" + str(temperature) + ".csv"
1231
+ basic_response_data_out_path = output_folder + file_path_details + "_simplified_data_file_" + model_choice_clean_short + "_temp_" + str(temperature) + ".csv"
 
1232
 
1233
  ## Reference table mapping response numbers to topics
1234
  existing_reference_df.to_csv(reference_table_out_path, index=None)
 
1237
 
1238
  # Create final unique topics table from reference table to ensure consistent numbers
1239
  final_out_topic_summary_df = create_topic_summary_df_from_reference_table(existing_reference_df)
1240
+ final_out_topic_summary_df["Group"] = group_name
1241
 
1242
  ## Unique topic list
1243
  final_out_topic_summary_df.to_csv(topic_summary_df_out_path, index=None, encoding='utf-8')
1244
  out_file_paths.append(topic_summary_df_out_path)
1245
 
1246
+ # Outputs for markdown table output
1247
+ unique_table_df_display_table = final_out_topic_summary_df.apply(lambda col: col.map(lambda x: wrap_text(x, max_text_length=500)))
1248
+ unique_table_df_display_table_markdown = unique_table_df_display_table[["General Topic", "Subtopic", "Sentiment", "Number of responses", "Summary", "Group"]].to_markdown(index=False)
1249
+
1250
  # Ensure that we are only returning the final results to outputs
1251
  out_file_paths = [x for x in out_file_paths if '_final_' in x]
1252
 
1253
  ## Reference table mapping response numbers to topics
1254
+ existing_reference_df_pivot["Group"] = group_name
1255
  existing_reference_df_pivot.to_csv(reference_table_out_pivot_path, index = None, encoding='utf-8')
1256
  log_files_output_paths.append(reference_table_out_pivot_path)
1257
 
1258
  ## Create a dataframe for missing response references:
1259
  # Assuming existing_reference_df and file_data are already defined
1260
+ # Simplify table to just responses column and the Response reference number
 
1261
  basic_response_data = get_basic_response_data(file_data, chosen_cols)
1262
 
 
1263
  # Save simplified file data to log outputs
1264
  pd.DataFrame(basic_response_data).to_csv(basic_response_data_out_path, index=None, encoding='utf-8')
1265
  log_files_output_paths.append(basic_response_data_out_path)
1266
 
 
1267
  # Step 1: Identify missing references
1268
  missing_references = basic_response_data[~basic_response_data['Reference'].astype(str).isin(existing_reference_df['Response References'].astype(str).unique())]
1269
 
 
1277
  # Display the new DataFrame
1278
  #print("missing_df:", missing_df)
1279
 
1280
+ missing_df_out_path = output_folder + file_path_details + "_missing_references_" + model_choice_clean_short + "_temp_" + str(temperature) + ".csv"
1281
  missing_df.to_csv(missing_df_out_path, index=None, encoding='utf-8')
1282
  log_files_output_paths.append(missing_df_out_path)
1283
 
 
1291
 
1292
  print("latest_batch_completed at end of batch iterations to return is", latest_batch_completed)
1293
 
1294
+ return unique_table_df_display_table_markdown, existing_topics_table, final_out_topic_summary_df, existing_reference_df, final_out_file_paths, final_out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, final_out_file_paths, final_out_file_paths, modifiable_topic_summary_df, final_out_file_paths, join_file_paths, existing_reference_df_pivot # gr.Dataframe(value=modifiable_topic_summary_df, headers=None, col_count=(modifiable_topic_summary_df.shape[1], "fixed"), row_count = (modifiable_topic_summary_df.shape[0], "fixed"), visible=True, type="pandas"),
1295
 
1296
 
1297
+ return unique_table_df_display_table_markdown, existing_topics_table, existing_topic_summary_df, existing_reference_df, out_file_paths, out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, out_file_paths, out_file_paths, modifiable_topic_summary_df, out_file_paths, join_file_paths, existing_reference_df_pivot # gr.Dataframe(value=modifiable_topic_summary_df, headers=None, col_count=(modifiable_topic_summary_df.shape[1], "fixed"), row_count = (modifiable_topic_summary_df.shape[0], "fixed"), visible=True, type="pandas"),
1298
 
1299
  def wrapper_extract_topics_per_column_value(
1300
  selected_col: str,
 
1360
  acc_topics_table = initial_existing_topics_table.copy()
1361
  acc_reference_df = initial_existing_reference_df.copy()
1362
  acc_topic_summary_df = initial_existing_topic_summary_df.copy()
1363
+ acc_reference_df_pivot = pd.DataFrame()
1364
 
1365
  # Lists are extended
1366
  acc_out_file_paths = []
 
1376
 
1377
  wrapper_first_loop = initial_first_loop_state
1378
 
1379
+ for i, group_value in tqdm(enumerate(unique_values), desc=f"Analysing by group", total=len(unique_values), unit="groups"):
1380
  print(f"\nProcessing segment: {selected_col} = {group_value} ({i+1}/{len(unique_values)})")
1381
 
1382
  filtered_file_data = file_data.copy()
 
1423
  seg_gradio_df,
1424
  _seg_out_files5, # Often same as 1
1425
  seg_join_files,
1426
+ seg_reference_df_pivot
1427
  ) = extract_topics(
1428
  in_data_file=in_data_file,
1429
  file_data=filtered_file_data,
 
1472
  # Aggregate results
1473
  # The DFs returned by extract_topics are already cumulative for *its own run*.
1474
  # We now make them cumulative for the *wrapper's run*.
1475
+ acc_reference_df = pd.concat([acc_reference_df, seg_reference_df])
1476
+ acc_topic_summary_df = pd.concat([acc_topic_summary_df, seg_topic_summary_df])
1477
+ acc_reference_df_pivot = pd.concat([acc_reference_df_pivot, seg_reference_df_pivot])
1478
 
1479
  # For lists, extend. Use set to remove duplicates if paths might be re-added.
1480
  acc_out_file_paths.extend(f for f in seg_out_files1 if f not in acc_out_file_paths)
 
1496
  # Optionally, decide if you want to continue with other segments or stop
1497
  # For now, it will continue
1498
  continue
1499
+
1500
+ if "Group" in acc_reference_df.columns:
1501
+ model_choice_clean = model_name_map[model_choice]
1502
+ model_choice_clean_short = clean_column_name(model_choice_clean, max_length=20, front_characters=False)
1503
+ overall_file_name = f"{clean_column_name(original_file_name, max_length=30)}_"
1504
+
1505
+ acc_reference_df_path = output_folder + overall_file_name + "all_reference_table_" + model_choice_clean_short + "_temp_" + str(temperature) + ".csv"
1506
+ acc_topic_summary_df_path = output_folder + overall_file_name + "all_unique_topics_" + model_choice_clean_short + "_temp_" + str(temperature) + ".csv"
1507
+ acc_reference_df_pivot_path = output_folder + overall_file_name + "all_reference_pivot_" + model_choice_clean_short + "_temp_" + str(temperature) + ".csv"
1508
+
1509
+ acc_reference_df.to_csv(acc_reference_df_path, index=None)
1510
+ acc_topic_summary_df.to_csv(acc_topic_summary_df_path, index=None)
1511
+ acc_reference_df_pivot.to_csv(acc_reference_df_pivot_path, index=None)
1512
+
1513
+ # Remove the existing output file list and replace with the updated concatenated outputs
1514
+ substring_list_to_remove = ["_final_reference_table_pivot_", "_final_reference_table_", "_final_unique_topics_"]
1515
+ acc_out_file_paths = [
1516
+ x for x in acc_out_file_paths
1517
+ if not any(sub in x for sub in substring_list_to_remove)
1518
+ ]
1519
+
1520
+ acc_out_file_paths.extend([acc_reference_df_path, acc_topic_summary_df_path])
1521
+
1522
+ # Outputs for markdown table output
1523
+ unique_table_df_display_table = acc_topic_summary_df.apply(lambda col: col.map(lambda x: wrap_text(x, max_text_length=500)))
1524
+ acc_markdown_output = unique_table_df_display_table[["General Topic", "Subtopic", "Sentiment", "Number of responses", "Summary", "Group"]].to_markdown(index=False)
1525
 
1526
  print(f"\nWrapper finished processing all segments. Total time: {acc_total_time_taken:.2f}s")
1527
 
tools/prompts.py CHANGED
@@ -85,7 +85,7 @@ Your task is to summarise the above table in markdown format. {summary_format}.
85
 
86
  Summary:"""
87
 
88
- comprehensive_summary_format_prompt = "Return a comprehensive summary that covers all the important topics and themes described in the table"
+ comprehensive_summary_format_prompt = "Return a comprehensive summary that covers all the important topics and themes described in the table. If there are different values in the Group column of the data, compare and contrast differences between the topics and themes from each Group."
89
 
90
 
91
  ### Verify exisiting categories prompt