seanpedrickcase committed
Commit 99b54b3 · 1 Parent(s): 7122331

Package updates. Can now ask the model to only assign specified topics.
.dockerignore CHANGED
@@ -5,6 +5,7 @@
 *.ipynb
 *.xls
 *.xlsx
+*.csv
 examples/*
 output/*
 tools/__pycache__/*
.gitignore CHANGED
@@ -5,6 +5,7 @@
 *.ipynb
 *.xls
 *.xlsx
+*.csv
 examples/*
 output/*
 tools/__pycache__/*
app.py CHANGED
@@ -1,11 +1,11 @@
 import os
 import socket
 import spaces
-from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df, get_connection_params, output_folder, get_or_create_env_var, reveal_feedback_buttons, wipe_logs, model_full_names, view_table, empty_output_vars_extract_topics, empty_output_vars_summarise, RUN_LOCAL_MODEL
+from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df, get_connection_params, output_folder, get_or_create_env_var, reveal_feedback_buttons, wipe_logs, model_full_names, view_table, empty_output_vars_extract_topics, empty_output_vars_summarise, RUN_LOCAL_MODEL, load_in_previous_reference_file, join_cols_onto_reference_df, GEMINI_API_KEY
 from tools.aws_functions import upload_file_to_s3, RUN_AWS_FUNCTIONS
 from tools.llm_api_call import extract_topics, load_in_data_file, load_in_previous_data_files, sample_reference_table_summaries, summarise_output_topics, batch_size_default, deduplicate_topics, modify_existing_output_tables
 from tools.auth import authenticate_user
-from tools.prompts import initial_table_prompt, prompt2, prompt3, system_prompt, add_existing_topics_system_prompt, add_existing_topics_prompt, verify_titles_prompt, verify_titles_system_prompt
+from tools.prompts import initial_table_prompt, prompt2, prompt3, system_prompt, add_existing_topics_system_prompt, add_existing_topics_prompt, verify_titles_prompt, verify_titles_system_prompt, two_para_summary_format_prompt, single_para_summary_format_prompt
 from tools.verify_titles import verify_titles
 #from tools.aws_functions import load_data_from_aws
 import gradio as gr
@@ -22,18 +22,14 @@ host_name = socket.gethostname()
 access_logs_data_folder = 'logs/' + today_rev + '/' + host_name + '/'
 feedback_data_folder = 'feedback/' + today_rev + '/' + host_name + '/'
 usage_data_folder = 'usage/' + today_rev + '/' + host_name + '/'
-file_input_height = 150
-
-print("RUN_LOCAL_MODEL is:", RUN_LOCAL_MODEL)
+file_input_height = 200

 if RUN_LOCAL_MODEL == "1":
     default_model_choice = "gemma_2b_it_local"
-
 elif RUN_AWS_FUNCTIONS == "1":
     default_model_choice = "anthropic.claude-3-haiku-20240307-v1:0"
-
 else:
-    default_model_choice = "gemini-2.0-flash"
+    default_model_choice = "gemini-2.0-flash-001"

 # Create the gradio interface
 app = gr.Blocks(theme = gr.themes.Base())
@@ -95,7 +91,7 @@ with app:

 Instructions on use can be found in the README.md file. Try it out with this [dummy development consultation dataset](https://huggingface.co/datasets/seanpedrickcase/dummy_development_consultation), which you can also try with [zero-shot topics](https://huggingface.co/datasets/seanpedrickcase/dummy_development_consultation/blob/main/example_zero_shot.csv), or this [dummy case notes dataset](https://huggingface.co/datasets/seanpedrickcase/dummy_case_notes).

-You can use an AWS Bedrock model (Claude 3, paid), or Gemini (a free API, but with strict limits for the Pro model). Due to the strict API limits for the best model (Pro 1.5), the use of Gemini requires an API key. To set up your own Gemini API key, go [here](https://aistudio.google.com/app/u/1/plan_information).
+You can use an AWS Bedrock model (Claude 3, paid), or Gemini (a free API, but with strict limits for the Pro model). The use of Gemini models requires an API key. To set up your own Gemini API key, go [here](https://aistudio.google.com/app/u/1/plan_information).

 NOTE: that **API calls to Gemini are not considered secure**, so please only submit redacted, non-sensitive tabular files to this source. Also, large language models are not 100% accurate and may produce biased or harmful outputs. All outputs from this app **absolutely need to be checked by a human** to check for harmful outputs, hallucinations, and accuracy.""")

@@ -107,7 +103,7 @@ with app:
 )
 with gr.Row():
     model_choice = gr.Dropdown(value = default_model_choice, choices = model_full_names, label="LLM model to use", multiselect=False)
-    in_api_key = gr.Textbox(value = "", label="Enter Gemini API key (only if using Google API models)", lines=1, type="password")
+    in_api_key = gr.Textbox(value = GEMINI_API_KEY, label="Enter Gemini API key (only if using Google API models)", lines=1, type="password")

 with gr.Accordion("Upload xlsx or csv file", open = True):
     in_data_files = gr.File(height=file_input_height, label="Choose Excel or csv files", file_count= "multiple", file_types=['.xlsx', '.xls', '.csv', '.parquet', '.csv.gz'])
@@ -116,12 +112,14 @@ with app:
 in_colnames = gr.Dropdown(choices=["Choose column with responses"], multiselect = False, label="Select the open text column of interest. In an Excel file, this shows columns across all sheets.", allow_custom_value=True, interactive=True)

 with gr.Accordion("I have my own list of topics (zero shot topic modelling).", open = False):
-    candidate_topics = gr.File(height=file_input_height, label="Input topics from file (csv). File should have at least one column with a header, and all topic names below this. Using the headers 'General Topic' and/or 'Subtopic' will allow for these columns to be suggested to the model.")
-    force_zero_shot_radio = gr.Radio(label="Force responses into zero shot topics", value="No", choices=["Yes", "No"])
+    candidate_topics = gr.File(height=file_input_height, label="Input topics from file (csv). File should have at least one column with a header, and all topic names below this. Using the headers 'General Topic' and/or 'Subtopic' will allow for these columns to be suggested to the model. If a third column is present, it will be assumed to be a topic description.")
+    with gr.Row(equal_height=True):
+        force_zero_shot_radio = gr.Radio(label="Force responses into zero shot topics", value="No", choices=["Yes", "No"])
+        force_single_topic_radio = gr.Radio(label="Ask the model to assign responses to only a single topic", value="No", choices=["Yes", "No"])

 context_textbox = gr.Textbox(label="Write up to one sentence giving context to the large language model for your task (e.g. 'Consultation for the construction of flats on Main Street')")

-sentiment_checkbox = gr.Radio(label="Choose sentiment categories to split responses", value="Negative, Neutral, or Positive", choices=["Negative, Neutral, or Positive", "Negative or Positive", "Do not assess sentiment"])
+sentiment_checkbox = gr.Radio(label="Choose sentiment categories to split responses", value="Negative or Positive", choices=["Negative or Positive", "Negative, Neutral, or Positive", "Do not assess sentiment"])

 extract_topics_btn = gr.Button("Extract topics", variant="primary")
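Both radios above reach extract_topics as plain "Yes"/"No" strings. As a rough sketch of how flags like these can switch prompt wording — force_existing_topics_prompt and allow_new_topics_prompt are real names imported in tools/llm_api_call.py, but the prompt text and gating below are illustrative, not the app's exact logic:

# Illustrative only: "Yes"/"No" radio values selecting prompt fragments.
force_existing_topics_prompt = "Assign responses only to topics from the provided table."  # placeholder text
allow_new_topics_prompt = "Assign responses to the provided topics, creating new topics where needed."  # placeholder text

def build_topic_instruction(force_zero_shot: str, force_single_topic: str) -> str:
    instruction = force_existing_topics_prompt if force_zero_shot == "Yes" else allow_new_topics_prompt
    if force_single_topic == "Yes":
        instruction += " Assign each response to a single topic only."
    return instruction

print(build_topic_instruction("Yes", "Yes"))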
 
@@ -153,10 +151,7 @@ with app:

 save_modified_files_button = gr.Button(value="Save modified topic names")

-
-with gr.Accordion("Upload reference data file and unique data files", open = True):
-
-
+with gr.Accordion("Upload reference data file and unique data files", open = True):
     ### DEDUPLICATION
     deduplication_input_files = gr.File(height=file_input_height, label="Upload files to deduplicate topics", file_count= "multiple", file_types=['.xlsx', '.xls', '.csv', '.parquet', '.csv.gz'])
     deduplication_input_files_status = gr.Textbox(value = "", label="Previous file input", visible=False)
@@ -168,11 +163,10 @@ with app:

     deduplicate_previous_data_btn = gr.Button("Deduplicate topics", variant="primary")

-
-    ### SUMMARISATION
+    ### SUMMARISATION
     summarisation_input_files = gr.File(height=file_input_height, label="Upload files to summarise", file_count= "multiple", file_types=['.xlsx', '.xls', '.csv', '.parquet', '.csv.gz'])

-    summarise_format_radio = gr.Radio(label="Choose summary type", value="Return a summary up to two paragraphs long that includes as much detail as possible from the original text", choices=["Return a summary up to two paragraphs long that includes as much detail as possible from the original text", "Return a concise summary up to one paragraph long that summarises only the most important themes from the original text"])
+    summarise_format_radio = gr.Radio(label="Choose summary type", value=two_para_summary_format_prompt, choices=[two_para_summary_format_prompt, single_para_summary_format_prompt])

     summarise_previous_data_btn = gr.Button("Summarise topics", variant="primary")
     summary_output_files = gr.File(height=file_input_height, label="Summarised output files", interactive=False)
@@ -198,10 +192,10 @@ with app:
     in_view_table = gr.File(height=file_input_height, label="Choose unique topic csv files", file_count= "single", file_types=['.csv', '.parquet', '.csv.gz'])
     view_table_markdown = gr.Markdown(value = "", label="View table", show_copy_button=True)

-with gr.Tab(label="Verify titles"):
+with gr.Tab(label="Verify descriptions"):
     gr.Markdown(
     """
-    ### Choose a tabular data file (xlsx or csv) with titles and original text to verify titles/descriptions for.
+    ### Choose a tabular data file (xlsx or csv) with titles and original text to verify descriptions for.
     """
     )
     with gr.Row():
@@ -212,11 +206,11 @@ with app:
     verify_in_data_files = gr.File(height=file_input_height, label="Choose Excel or csv files", file_count= "multiple", file_types=['.xlsx', '.xls', '.csv', '.parquet', '.csv.gz'])

     verify_in_excel_sheets = gr.Dropdown(choices=["Choose Excel sheet"], multiselect = False, label="Select the Excel sheet.", visible=False, allow_custom_value=True)
-    verify_in_colnames = gr.Dropdown(choices=["Choose column with responses"], multiselect = True, label="Select the open text columns that have a response and a title. In an Excel file, this shows columns across all sheets.", allow_custom_value=True, interactive=True)
+    verify_in_colnames = gr.Dropdown(choices=["Choose column with responses"], multiselect = True, label="Select the open text columns that have a response and a title/description. In an Excel file, this shows columns across all sheets.", allow_custom_value=True, interactive=True)
     #verify_title_colnames = gr.Dropdown(choices=["Choose column with titles"], multiselect = False, label="Select the open text columns that have a title. In an Excel file, this shows columns across all sheets.", allow_custom_value=True, interactive=True)

-    verify_titles_btn = gr.Button("Verify titles", variant="primary")
-    verify_titles_file_output = gr.File(height=file_input_height, label="Title verification output files")
+    verify_titles_btn = gr.Button("Verify descriptions", variant="primary")
+    verify_titles_file_output = gr.File(height=file_input_height, label="Descriptions verification output files")
     verify_display_topic_table_markdown = gr.Markdown(value="### Language model response will appear here", show_copy_button=True)

     verify_modification_input_files_placeholder = gr.File(height=file_input_height, label="Placeholder for files to avoid errors", visible=False)
@@ -231,7 +225,7 @@ with app:
     batch_size_number = gr.Number(label = "Number of responses to submit in a single LLM query", value = batch_size_default, precision=0, minimum=1, maximum=100)
     random_seed = gr.Number(value=42, label="Random seed for LLM generation", visible=False)

-    with gr.Accordion("Prompt settings", open = True):
+    with gr.Accordion("Prompt settings", open = False):
        number_of_prompts = gr.Number(value=1, label="Number of prompts to send to LLM in sequence", minimum=1, maximum=3, visible=False)
        system_prompt_textbox = gr.Textbox(label="Initial system prompt", lines = 4, value = system_prompt)
        initial_table_prompt_textbox = gr.Textbox(label = "Initial topics prompt", lines = 8, value = initial_table_prompt)
@@ -241,9 +235,17 @@ with app:
        add_to_existing_topics_prompt_textbox = gr.Textbox(label = "Additional topics prompt", lines = 8, value = add_existing_topics_prompt)
        verify_titles_system_prompt_textbox = gr.Textbox(label="Additional topics system prompt", lines = 4, value = verify_titles_system_prompt)
        verify_titles_prompt_textbox = gr.Textbox(label = "Additional topics prompt", lines = 8, value = verify_titles_prompt)
-
-    log_files_output = gr.File(height=file_input_height, label="Log file output", interactive=False)
-    conversation_metadata_textbox = gr.Textbox(label="Query metadata - usage counts and other parameters", interactive=False, lines=8)
+
+    with gr.Accordion("Join additional columns to reference file outputs", open = False):
+        join_colnames = gr.Dropdown(choices=["Choose column with responses"], multiselect = True, label="Select the open text column of interest. In an Excel file, this shows columns across all sheets.", allow_custom_value=True, interactive=True)
+        with gr.Row():
+            in_join_files = gr.File(height=file_input_height, label="Reference file should go here. Original data file should be loaded on the first tab.")
+            join_cols_btn = gr.Button("Join columns to reference output", variant="primary")
+            out_join_files = gr.File(height=file_input_height, label="Output joined reference files will go here.")
+
+    with gr.Accordion("Logging outputs", open = False):
+        log_files_output = gr.File(height=file_input_height, label="Log file output", interactive=False)
+        conversation_metadata_textbox = gr.Textbox(label="Query metadata - usage counts and other parameters", interactive=False, lines=8)

     # Invisible text box to hold the session hash/username just for logging purposes
     session_hash_textbox = gr.Textbox(label = "Session hash", value="", visible=False)
@@ -271,25 +273,28 @@ with app:
 ###

 # Tabular data upload
-in_data_files.upload(fn=put_columns_in_df, inputs=[in_data_files], outputs=[in_colnames, in_excel_sheets, reference_data_file_name_textbox])
+in_data_files.upload(fn=put_columns_in_df, inputs=[in_data_files], outputs=[in_colnames, in_excel_sheets, reference_data_file_name_textbox, join_colnames])

 extract_topics_btn.click(fn=empty_output_vars_extract_topics, inputs=None, outputs=[master_topic_df_state, master_unique_topics_df_state, master_reference_df_state, topic_extraction_output_files, text_output_file_list_state, latest_batch_completed, log_files_output, log_files_output_list_state, conversation_metadata_textbox, estimated_time_taken_number, file_data_state, reference_data_file_name_textbox, display_topic_table_markdown]).\
 success(load_in_data_file,
 inputs = [in_data_files, in_colnames, batch_size_number, in_excel_sheets], outputs = [file_data_state, reference_data_file_name_textbox, total_number_of_batches], api_name="load_data").\
 success(fn=extract_topics,
-inputs=[in_data_files, file_data_state, master_topic_df_state, master_reference_df_state, master_unique_topics_df_state, display_topic_table_markdown, reference_data_file_name_textbox, total_number_of_batches, in_api_key, temperature_slide, in_colnames, model_choice, candidate_topics, latest_batch_completed, display_topic_table_markdown, text_output_file_list_state, log_files_output_list_state, first_loop_state, conversation_metadata_textbox, initial_table_prompt_textbox, prompt_2_textbox, prompt_3_textbox, system_prompt_textbox, add_to_existing_topics_system_prompt_textbox, add_to_existing_topics_prompt_textbox, number_of_prompts, batch_size_number, context_textbox, estimated_time_taken_number, sentiment_checkbox, force_zero_shot_radio, in_excel_sheets],
-outputs=[display_topic_table_markdown, master_topic_df_state, master_unique_topics_df_state, master_reference_df_state, topic_extraction_output_files, text_output_file_list_state, latest_batch_completed, log_files_output, log_files_output_list_state, conversation_metadata_textbox, estimated_time_taken_number, deduplication_input_files, summarisation_input_files, modifiable_unique_topics_df_state, modification_input_files], api_name="extract_topics")
+inputs=[in_data_files, file_data_state, master_topic_df_state, master_reference_df_state, master_unique_topics_df_state, display_topic_table_markdown, reference_data_file_name_textbox, total_number_of_batches, in_api_key, temperature_slide, in_colnames, model_choice, candidate_topics, latest_batch_completed, display_topic_table_markdown, text_output_file_list_state, log_files_output_list_state, first_loop_state, conversation_metadata_textbox, initial_table_prompt_textbox, prompt_2_textbox, prompt_3_textbox, system_prompt_textbox, add_to_existing_topics_system_prompt_textbox, add_to_existing_topics_prompt_textbox, number_of_prompts, batch_size_number, context_textbox, estimated_time_taken_number, sentiment_checkbox, force_zero_shot_radio, in_excel_sheets, force_single_topic_radio],
+outputs=[display_topic_table_markdown, master_topic_df_state, master_unique_topics_df_state, master_reference_df_state, topic_extraction_output_files, text_output_file_list_state, latest_batch_completed, log_files_output, log_files_output_list_state, conversation_metadata_textbox, estimated_time_taken_number, deduplication_input_files, summarisation_input_files, modifiable_unique_topics_df_state, modification_input_files, in_join_files], api_name="extract_topics")


 # If the output file count text box changes, keep going with redacting each data file until done. Then reveal the feedback buttons.
 # latest_batch_completed.change(fn=extract_topics,
 # inputs=[in_data_files, file_data_state, master_topic_df_state, master_reference_df_state, master_unique_topics_df_state, display_topic_table_markdown, reference_data_file_name_textbox, total_number_of_batches, in_api_key, temperature_slide, in_colnames, model_choice, candidate_topics, latest_batch_completed, display_topic_table_markdown, text_output_file_list_state, log_files_output_list_state, second_loop_state, conversation_metadata_textbox, initial_table_prompt_textbox, prompt_2_textbox, prompt_3_textbox, system_prompt_textbox, add_to_existing_topics_system_prompt_textbox, add_to_existing_topics_prompt_textbox, number_of_prompts, batch_size_number, context_textbox, estimated_time_taken_number, sentiment_checkbox, force_zero_shot_radio, in_excel_sheets],
-# outputs=[display_topic_table_markdown, master_topic_df_state, master_unique_topics_df_state, master_reference_df_state, topic_extraction_output_files, text_output_file_list_state, latest_batch_completed, log_files_output, log_files_output_list_state, conversation_metadata_textbox, estimated_time_taken_number, deduplication_input_files, summarisation_input_files, modifiable_unique_topics_df_state, modification_input_files]).\
+# outputs=[display_topic_table_markdown, master_topic_df_state, master_unique_topics_df_state, master_reference_df_state, topic_extraction_output_files, text_output_file_list_state, latest_batch_completed, log_files_output, log_files_output_list_state, conversation_metadata_textbox, estimated_time_taken_number, deduplication_input_files, summarisation_input_files, modifiable_unique_topics_df_state, modification_input_files, in_join_files]).\
 # success(fn = reveal_feedback_buttons,
 # outputs=[data_feedback_radio, data_further_details_text, data_submit_feedback_btn, data_feedback_title], scroll_to_output=True)

 # If you upload data into the deduplication input box, the modifiable topic dataframe box is updated
 modification_input_files.change(fn=load_in_previous_data_files, inputs=[modification_input_files, modified_unique_table_change_bool], outputs=[modifiable_unique_topics_df_state, master_modify_reference_df_state, master_modify_unique_topics_df_state, reference_data_file_name_textbox, unique_topics_table_file_name_textbox, text_output_modify_file_list_state])
+
+
+


 # Modify output table with custom topic names
@@ -314,17 +319,29 @@ with app:
 load_in_data_file, inputs = [in_data_files, in_colnames, batch_size_number, in_excel_sheets], outputs = [file_data_state, reference_data_file_name_textbox, total_number_of_batches]).\
 success(load_in_previous_data_files, inputs=[in_previous_data_files], outputs=[master_reference_df_state, master_unique_topics_df_state, latest_batch_completed, in_previous_data_files_status, reference_data_file_name_textbox])

-# VERIFY TITLES OR DESCRIPTIONS OF TEXT
+# VERIFY DESCRIPTIONS OF TEXT

 # Tabular data upload
-verify_in_data_files.upload(fn=put_columns_in_df, inputs=[verify_in_data_files], outputs=[verify_in_colnames, verify_in_excel_sheets, reference_data_file_name_textbox])
+verify_in_data_files.upload(fn=put_columns_in_df, inputs=[verify_in_data_files], outputs=[verify_in_colnames, verify_in_excel_sheets, reference_data_file_name_textbox, join_colnames])

 verify_titles_btn.click(fn=empty_output_vars_extract_topics, inputs=None, outputs=[master_topic_df_state, master_unique_topics_df_state, master_reference_df_state, topic_extraction_output_files, text_output_file_list_state, latest_batch_completed, log_files_output, log_files_output_list_state, conversation_metadata_textbox, estimated_time_taken_number, file_data_state, reference_data_file_name_textbox, display_topic_table_markdown]).\
 success(load_in_data_file,
 inputs = [verify_in_data_files, verify_in_colnames, batch_size_number, verify_in_excel_sheets], outputs = [file_data_state, reference_data_file_name_textbox, total_number_of_batches], api_name="verify_load_data").\
 success(fn=verify_titles,
 inputs=[verify_in_data_files, file_data_state, master_topic_df_state, master_reference_df_state, master_unique_topics_df_state, display_topic_table_markdown, reference_data_file_name_textbox, total_number_of_batches, verify_in_api_key, temperature_slide, verify_in_colnames, verify_model_choice, candidate_topics, latest_batch_completed, display_topic_table_markdown, text_output_file_list_state, log_files_output_list_state, first_loop_state, conversation_metadata_textbox, verify_titles_prompt_textbox, prompt_2_textbox, prompt_3_textbox, verify_titles_system_prompt_textbox, verify_titles_system_prompt_textbox, verify_titles_prompt_textbox, number_of_prompts, batch_size_number, context_textbox, estimated_time_taken_number, sentiment_checkbox, force_zero_shot_radio, in_excel_sheets],
-outputs=[verify_display_topic_table_markdown, master_topic_df_state, master_unique_topics_df_state, master_reference_df_state, verify_titles_file_output, text_output_file_list_state, latest_batch_completed, log_files_output, log_files_output_list_state, conversation_metadata_textbox, estimated_time_taken_number, deduplication_input_files, summarisation_input_files, modifiable_unique_topics_df_state, verify_modification_input_files_placeholder], api_name="verify_titles")
+outputs=[verify_display_topic_table_markdown, master_topic_df_state, master_unique_topics_df_state, master_reference_df_state, verify_titles_file_output, text_output_file_list_state, latest_batch_completed, log_files_output, log_files_output_list_state, conversation_metadata_textbox, estimated_time_taken_number, deduplication_input_files, summarisation_input_files, modifiable_unique_topics_df_state, verify_modification_input_files_placeholder], api_name="verify_descriptions")
+
+###
+# LLM SETTINGS PAGE
+###
+
+reference_df_data_file_name_textbox = gr.Textbox(label="reference_df_data_file_name_textbox", visible=False)
+master_reference_df_state_joined = gr.State(pd.DataFrame())
+
+join_cols_btn.click(fn=load_in_previous_reference_file, inputs=[in_join_files], outputs=[master_reference_df_state, reference_df_data_file_name_textbox]).\
+success(load_in_data_file,
+inputs = [in_data_files, in_colnames, batch_size_number, in_excel_sheets], outputs = [file_data_state, reference_data_file_name_textbox, total_number_of_batches]).\
+success(fn=join_cols_onto_reference_df, inputs=[master_reference_df_state, file_data_state, join_colnames, reference_df_data_file_name_textbox], outputs=[master_reference_df_state_joined, out_join_files])

 ###
 # LOGGING AND ON APP LOAD FUNCTIONS
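The handlers above rely on Gradio's event chaining: .click() returns an event object whose .success() step runs only if the previous function finished without raising. A minimal self-contained sketch of the pattern (placeholder functions, not the app's real handlers):

import gradio as gr

def load(x):
    return f"loaded {x}"

def process(msg):
    return msg.upper()

with gr.Blocks() as demo:
    inp = gr.Textbox(label="Input")
    mid = gr.Textbox(label="Loaded")
    out = gr.Textbox(label="Processed")
    btn = gr.Button("Run")
    # Each .success() step only fires if the preceding step succeeded,
    # mirroring the load-then-extract chains wired up above.
    btn.click(fn=load, inputs=inp, outputs=mid).\
        success(fn=process, inputs=mid, outputs=out)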
 
requirements.txt CHANGED
@@ -1,7 +1,7 @@
 pandas==2.2.3
-gradio==5.23.3
+gradio==5.32.0
 spaces==0.34.1
-boto3==1.37.29
+boto3==1.38.5
 pyarrow==19.0.1
 openpyxl==3.1.3
 markdown==3.7
requirements_aws.txt CHANGED
@@ -1,7 +1,7 @@
 pandas==2.2.3
-gradio==5.23.3
+gradio==5.32.0
 spaces==0.34.1
-boto3==1.37.29
+boto3==1.38.5
 pyarrow==19.0.1
 openpyxl==3.1.3
 markdown==3.7
requirements_gpu.txt CHANGED
@@ -1,7 +1,7 @@
 pandas==2.2.3
-gradio==5.23.3
+gradio==5.32.0
 spaces==0.34.1
-boto3==1.37.29
+boto3==1.38.5
 pyarrow==19.0.1
 openpyxl==3.1.3
 markdown==3.7
@@ -14,7 +14,8 @@ rapidfuzz==3.10.1
 torch==2.6.0 --extra-index-url https://download.pytorch.org/whl/cu121
 #llama-cpp-python==0.3.4 --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu121
 # Specify exact llama_cpp wheel for huggingface compatibility
-https://github.com/abetlen/llama-cpp-python/releases/download/v0.3.4-cu121/llama_cpp_python-0.3.4-cp311-cp311-linux_x86_64.whl
+#https://github.com/abetlen/llama-cpp-python/releases/download/v0.3.4-cu121/llama_cpp_python-0.3.4-cp311-cp311-linux_x86_64.whl
+https://github.com/abetlen/llama-cpp-python/releases/download/v0.3.4-cu121/llama_cpp_python-0.3.4-cp311-cp311-win_amd64.whl # Windows
 transformers==4.51.1
 numpy==1.26.4
 typing_extensions==4.12.2
tools/aws_functions.py CHANGED
@@ -13,9 +13,14 @@ bucket_name=""
 AWS_REGION = get_or_create_env_var('AWS_REGION', 'eu-west-2')
 print(f'The value of AWS_REGION is {AWS_REGION}')

+CONSULTATION_SUMMARY_BUCKET = get_or_create_env_var('CONSULTATION_SUMMARY_BUCKET', '')
+print(f'The value of CONSULTATION_SUMMARY_BUCKET is {CONSULTATION_SUMMARY_BUCKET}')
+
+
+
 if RUN_AWS_FUNCTIONS == "1":
     try:
-        bucket_name = os.environ['CONSULTATION_SUMMARY_BUCKET']
+        bucket_name = CONSULTATION_SUMMARY_BUCKET
         session = boto3.Session() # profile_name="default"
     except Exception as e:
         print(e)
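The body of get_or_create_env_var isn't shown in this diff; a minimal sketch of what a helper with this signature plausibly does (an assumption, consistent with how it is called here):

import os

def get_or_create_env_var(var_name: str, default_value: str) -> str:
    # Read the variable if set; otherwise fall back to (and record) the default.
    value = os.environ.get(var_name)
    if value is None:
        os.environ[var_name] = default_value
        value = default_value
    return value

# The commit swaps a hard os.environ['CONSULTATION_SUMMARY_BUCKET'] lookup
# for this pattern, so a missing variable no longer raises KeyError:
bucket_name = get_or_create_env_var('CONSULTATION_SUMMARY_BUCKET', '')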
tools/helper_functions.py CHANGED
@@ -1,6 +1,9 @@
 import os
+import re
 import gradio as gr
 import pandas as pd
+from typing import List
+import math

 def empty_output_vars_extract_topics():
     # Empty output objects before processing a new file
@@ -46,22 +49,35 @@ def get_or_create_env_var(var_name, default_value):

     return value

-RUN_AWS_FUNCTIONS = get_or_create_env_var("RUN_AWS_FUNCTIONS", "0")
+RUN_AWS_FUNCTIONS = get_or_create_env_var("RUN_AWS_FUNCTIONS", "1")
 print(f'The value of RUN_AWS_FUNCTIONS is {RUN_AWS_FUNCTIONS}')

 RUN_LOCAL_MODEL = get_or_create_env_var("RUN_LOCAL_MODEL", "1")
 print(f'The value of RUN_LOCAL_MODEL is {RUN_LOCAL_MODEL}')

+RUN_GEMINI_MODELS = get_or_create_env_var("RUN_GEMINI_MODELS", "1")
+print(f'The value of RUN_GEMINI_MODELS is {RUN_GEMINI_MODELS}')
+
+GEMINI_API_KEY = get_or_create_env_var('GEMINI_API_KEY', '')
+
+# Build up options for models
+model_full_names = []
+model_short_names = []
+
+if RUN_LOCAL_MODEL == "1":
+    model_full_names.append("gemma_2b_it_local")
+    model_short_names.append("gemma_local")
+
 if RUN_AWS_FUNCTIONS == "1":
-    model_full_names = ["anthropic.claude-3-haiku-20240307-v1:0", "anthropic.claude-3-sonnet-20240229-v1:0", "gemini-2.0-flash", "gemini-1.5-pro-002", "gemma_2b_it_local"]
-    model_short_names = ["haiku", "sonnet", "gemini_flash", "gemini_pro", "gemma_local"]
-else:
-    model_full_names = ["gemini-2.0-flash", "gemini-1.5-pro-002", "gemma_2b_it_local"]
-    model_short_names = ["gemini_flash", "gemini_pro", "gemma_local"]
+    model_full_names.extend(["anthropic.claude-3-haiku-20240307-v1:0", "anthropic.claude-3-sonnet-20240229-v1:0"])
+    model_short_names.extend(["haiku", "sonnet"])

-if RUN_LOCAL_MODEL == "0":
-    model_full_names.remove("gemma_2b_it_local")
-    model_short_names.remove("gemma_local")
+if RUN_GEMINI_MODELS == "1":
+    model_full_names.extend(["gemini-2.0-flash-001", "gemini-2.5-flash-preview-05-20", "gemini-2.5-pro-exp-05-06"]) # Gemini Pro no longer available on free tier
+    model_short_names.extend(["gemini_flash_2", "gemini_flash_2.5", "gemini_pro"])
+
+print("model_short_names:", model_short_names)
+print("model_full_names:", model_full_names)

 model_name_map = {short: full for short, full in zip(model_full_names, model_short_names)}
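One subtlety in the unchanged model_name_map line above: zip(model_full_names, model_short_names) yields (full, short) pairs, so unpacking them as short, full makes the dictionary keys the full names despite the variable naming. A quick check with two of the names from the lists above:

model_full_names = ["gemma_2b_it_local", "anthropic.claude-3-haiku-20240307-v1:0"]
model_short_names = ["gemma_local", "haiku"]

# zip pairs the lists positionally as (full, short); the comprehension's
# variable names are swapped, so keys end up being the full names.
model_name_map = {short: full for short, full in zip(model_full_names, model_short_names)}
print(model_name_map)
# {'gemma_2b_it_local': 'gemma_local', 'anthropic.claude-3-haiku-20240307-v1:0': 'haiku'}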
 
@@ -123,6 +139,113 @@ def read_file(filename:str, sheet:str=""):
     elif file_type == 'parquet':
         return pd.read_parquet(filename)

+def load_in_file(file_path: str, colnames:List[str]="", excel_sheet:str=""):
+    """
+    Loads in a tabular data file and returns data and file name.
+
+    Parameters:
+    - file_path (str): The path to the file to be processed.
+    - colnames (List[str], optional): list of colnames to load in
+    """
+
+    #file_type = detect_file_type(file_path)
+    #print("File type is:", file_type)
+
+    file_name = get_file_name_no_ext(file_path)
+    file_data = read_file(file_path, excel_sheet)
+
+    if colnames and isinstance(colnames, list):
+        col_list = colnames
+    else:
+        col_list = list(file_data.columns)
+
+    if not isinstance(col_list, List):
+        col_list = [col_list]
+
+    col_list = [item for item in col_list if item not in ["", "NA"]]
+
+    for col in col_list:
+        file_data[col] = file_data[col].fillna("")
+        file_data[col] = file_data[col].astype(str).str.replace("\bnan\b", "", regex=True)
+
+    #print(file_data[colnames])
+
+    return file_data, file_name
+
+def load_in_data_file(file_paths:List[str], in_colnames:List[str], batch_size:int=50, in_excel_sheets:str=""):
+    '''Load in data table, work out how many batches needed.'''
+
+    if not isinstance(in_colnames, list):
+        in_colnames = [in_colnames]
+
+    #print("in_colnames:", in_colnames)
+
+    try:
+        file_data, file_name = load_in_file(file_paths[0], colnames=in_colnames, excel_sheet=in_excel_sheets)
+        num_batches = math.ceil(len(file_data) / batch_size)
+        print("Total number of batches:", num_batches)
+
+    except Exception as e:
+        print(e)
+        file_data = pd.DataFrame()
+        file_name = ""
+        num_batches = 1
+
+    return file_data, file_name, num_batches
+
+def load_in_previous_reference_file(file:str):
+    '''Load in data table from a partially completed consultation summary to continue it.'''
+
+    reference_file_data = pd.DataFrame()
+    reference_file_name = ""
+    out_message = ""
+
+    #for file in file_paths:
+
+    print("file:", file)
+
+    # If reference table
+    if 'reference_table' in file:
+        try:
+            reference_file_data, reference_file_name = load_in_file(file)
+            #print("reference_file_data:", reference_file_data.head(2))
+            out_message = out_message + " Reference file load successful."
+        except Exception as e:
+            out_message = "Could not load reference file data:" + str(e)
+            raise Exception("Could not load reference file data:", e)
+
+    if reference_file_data.empty:
+        out_message = out_message + " No reference data table provided."
+        raise Exception(out_message)
+
+    print(out_message)
+
+    return reference_file_data, reference_file_name
+
+def join_cols_onto_reference_df(reference_df:pd.DataFrame, original_data_df:pd.DataFrame, join_columns:List[str], original_file_name:str, output_folder:str=output_folder):
+
+    #print("original_data_df columns:", original_data_df.columns)
+
+    original_data_df.reset_index(names="Response References", inplace=True)
+    original_data_df["Response References"] += 1
+
+    #print("reference_df columns:", reference_df.columns)
+
+    join_columns.append("Response References")
+
+    reference_df["Response References"] = reference_df["Response References"].fillna("-1").astype(int)
+
+    save_file_name = output_folder + original_file_name + "_j.csv"
+
+    out_reference_df = reference_df.merge(original_data_df[join_columns], on = "Response References", how="left")
+    out_reference_df.to_csv(save_file_name, index=None)
+
+    file_data_outputs = [save_file_name]
+
+    return out_reference_df, file_data_outputs
+
 # Wrap text in each column to the specified max width, including whole words
 def wrap_text(text:str, max_width=60, max_text_length=None):
     if not isinstance(text, str):
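A toy illustration of the keying that join_cols_onto_reference_df relies on: the original responses get 1-based "Response References" from their row index, and the chosen columns are merged onto the reference table by that key (column names and values below are made up):

import pandas as pd

original = pd.DataFrame({"Response": ["Too tall", "Good design"], "Ward": ["North", "South"]})
original = original.reset_index(names="Response References")
original["Response References"] += 1  # 1-based, matching the reference table

reference = pd.DataFrame({"Response References": [1, 2], "Subtopic": ["Height", "Design"]})
joined = reference.merge(original[["Ward", "Response References"]], on="Response References", how="left")
print(joined)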
@@ -209,6 +332,26 @@ def wrap_text(text:str, max_width=60, max_text_length=None):

     return '<br>'.join(wrapped_lines)

+def initial_clean(text):
+    #### Some of my cleaning functions
+    html_pattern_regex = r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});|\xa0|&nbsp;'
+    html_start_pattern_end_dots_regex = r'<(.*?)\.\.'
+    non_ascii_pattern = r'[^\x00-\x7F]+'
+    multiple_spaces_regex = r'\s{2,}'
+
+    # Define a list of patterns and their replacements
+    patterns = [
+        (html_pattern_regex, ' '),
+        (html_start_pattern_end_dots_regex, ' '),
+        (non_ascii_pattern, ' '),
+        (multiple_spaces_regex, ' ')
+    ]
+
+    # Apply each regex replacement
+    for pattern, replacement in patterns:
+        text = re.sub(pattern, replacement, text)
+
+    return text

 def view_table(file_path: str): # Added max_width parameter
     df = pd.read_csv(file_path)
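Given the definition above, a quick check of initial_clean on a messy response string (the import works once this commit's version of tools/helper_functions.py is in place):

from tools.helper_functions import initial_clean

sample = "<p>Great&nbsp;scheme\u2026</p>   but  too   tall"
print(initial_clean(sample))
# HTML tags and entities and non-ASCII characters become spaces, then runs
# of whitespace collapse, leaving roughly " Great scheme but too tall"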
@@ -234,7 +377,7 @@ def ensure_output_folder_exists():
     else:
         print(f"The 'output/' folder already exists.")

-def put_columns_in_df(in_file):
+def put_columns_in_df(in_file:List[str]):
     new_choices = []
     concat_choices = []
     all_sheet_names = []
@@ -272,9 +415,9 @@ def put_columns_in_df(in_file):
     concat_choices = sorted(set(concat_choices))

     if number_of_excel_files > 0:
-        return gr.Dropdown(choices=concat_choices, value=concat_choices[0]), gr.Dropdown(choices=all_sheet_names, value=all_sheet_names[0], visible=True, interactive=True), file_end
+        return gr.Dropdown(choices=concat_choices, value=concat_choices[0]), gr.Dropdown(choices=all_sheet_names, value=all_sheet_names[0], visible=True, interactive=True), file_end, gr.Dropdown(choices=concat_choices)
     else:
-        return gr.Dropdown(choices=concat_choices, value=concat_choices[0]), gr.Dropdown(visible=False), file_end
+        return gr.Dropdown(choices=concat_choices, value=concat_choices[0]), gr.Dropdown(visible=False), file_end, gr.Dropdown(choices=concat_choices)

 # Following function is only relevant for locally-created executable files based on this app (when using pyinstaller it creates a _internal folder that contains tesseract and poppler. These need to be added to the system path to enable the app to run)
 def add_folder_to_path(folder_path: str):
 
tools/llm_api_call.py CHANGED
@@ -19,8 +19,8 @@ from io import StringIO

 GradioFileData = gr.FileData

-from tools.prompts import initial_table_prompt, prompt2, prompt3, system_prompt, summarise_topic_descriptions_prompt, summarise_topic_descriptions_system_prompt, add_existing_topics_system_prompt, add_existing_topics_prompt, create_general_topics_system_prompt, create_general_topics_prompt, force_existing_topics_prompt, allow_new_topics_prompt
-from tools.helper_functions import output_folder, detect_file_type, get_file_name_no_ext, read_file, get_or_create_env_var, model_name_map, put_columns_in_df, wrap_text
 from tools.chatfuncs import LlamaCPPGenerationConfig, call_llama_cpp_model, load_model, RUN_LOCAL_MODEL

 # ResponseObject class for AWS Bedrock calls
@@ -59,62 +59,6 @@ def normalise_string(text):

     return text

-def load_in_file(file_path: str, colnames:List[str]="", excel_sheet:str=""):
-    """
-    Loads in a tabular data file and returns data and file name.
-
-    Parameters:
-    - file_path (str): The path to the file to be processed.
-    - colnames (List[str], optional): list of colnames to load in
-    """
-
-    file_type = detect_file_type(file_path)
-    #print("File type is:", file_type)
-
-    file_name = get_file_name_no_ext(file_path)
-    file_data = read_file(file_path, excel_sheet)
-
-    print("colnames:", colnames)
-
-    if colnames and isinstance(colnames, list):
-        col_list = colnames
-    else:
-        col_list = list(file_data.columns)
-
-    if not isinstance(col_list, List):
-        col_list = [col_list]
-
-    col_list = [item for item in col_list if item not in ["", "NA"]]
-
-    for col in col_list:
-        file_data[col] = file_data[col].fillna("")
-        file_data[col] = file_data[col].astype(str).str.replace("\bnan\b", "", regex=True)
-
-    #print(file_data[colnames])
-
-    return file_data, file_name
-
-def load_in_data_file(file_paths:List[str], in_colnames:List[str], batch_size:int=50, in_excel_sheets:str=""):
-    '''Load in data table, work out how many batches needed.'''
-
-    if not isinstance(in_colnames, list):
-        in_colnames = [in_colnames]
-
-    print("in_colnames:", in_colnames)
-
-    try:
-        file_data, file_name = load_in_file(file_paths[0], colnames=in_colnames, excel_sheet=in_excel_sheets)
-        num_batches = math.ceil(len(file_data) / batch_size)
-        print("Total number of batches:", num_batches)
-
-    except Exception as e:
-        print(e)
-        file_data = pd.DataFrame()
-        file_name = ""
-        num_batches = 1
-
-    return file_data, file_name, num_batches
-
 def load_in_previous_data_files(file_paths_partial_output:List[str], for_modified_table:bool=False):
     '''Load in data table from a partially completed consultation summary to continue it.'''
@@ -186,6 +130,7 @@ def load_in_previous_data_files(file_paths_partial_output:List[str], for_modifie

     return gr.Dataframe(value=unique_file_data, headers=None, col_count=(unique_file_data.shape[1], "fixed"), row_count = (unique_file_data.shape[0], "fixed"), visible=True, type="pandas"), reference_file_data, unique_file_data, reference_file_name, unique_file_name, out_file_names

+
 def get_basic_response_data(file_data:pd.DataFrame, chosen_cols:List[str], verify_titles:bool=False) -> pd.DataFrame:

     if not isinstance(chosen_cols, list):
@@ -199,10 +144,12 @@ def get_basic_response_data(file_data:pd.DataFrame, chosen_cols:List[str], verif
     if verify_titles == True:
         basic_response_data = basic_response_data.rename(columns={chosen_cols[0]: "Response", chosen_cols[1]: "Title"})
         basic_response_data["Title"] = basic_response_data["Title"].str.strip()
+
     else:
         basic_response_data = basic_response_data.rename(columns={chosen_cols[0]: "Response"})

     basic_response_data["Response"] = basic_response_data["Response"].str.strip()
+

     return basic_response_data
@@ -245,12 +192,12 @@ def data_file_to_markdown_table(file_data:pd.DataFrame, file_name:str, chosen_co
     else:
         end_row = file_len + 1

-    print("start_row:", start_row)
-    print("end_row:", end_row)

     batch_basic_response_data = basic_response_data[start_row:end_row] # Select the current batch

-    print("batch_basic_response_data:", batch_basic_response_data)

     # Now replace the reference numbers with numbers starting from 1
     batch_basic_response_data.loc[:, "Reference"] = batch_basic_response_data["Reference"] - start_row
@@ -398,7 +345,7 @@ def call_aws_claude(prompt: str, system_prompt: str, temperature: float, max_tok

     # Now you can access both the text and metadata
     #print("Text:", response.text)
-    print("Metadata:", response.usage_metadata)
     #print("Text:", response.text)

     return response
@@ -428,7 +375,7 @@ def send_request(prompt: str, conversation_history: List[dict], model: object, c
     progress_bar = range(0,number_of_api_retry_attempts)

     # Generate the model's response
-    if model_choice in ["gemini-2.0-flash", "gemini-1.5-pro-002"]:

         for i in progress_bar:
             try:
@@ -451,7 +398,7 @@ def send_request(prompt: str, conversation_history: List[dict], model: object, c

         if i == number_of_api_retry_attempts:
             return ResponseObject(text="", usage_metadata={'RequestId':"FAILED"}), conversation_history
-    elif model_choice in ["anthropic.claude-3-haiku-20240307-v1:0", "anthropic.claude-3-sonnet-20240229-v1:0"]:
         for i in progress_bar:
             try:
                 print("Calling AWS Claude model, attempt", i + 1)
@@ -661,70 +608,6 @@ def clean_markdown_table(text: str):

     return result

-# def clean_markdown_table(text: str):
-#     lines = text.splitlines()
-
-#     # Remove any empty rows or rows with only pipes
-#     cleaned_lines = [line for line in lines if not re.match(r'^\s*\|?\s*\|?\s*$', line)]
-
-#     # Merge lines that belong to the same row (i.e., don't start with |)
-#     merged_lines = []
-#     buffer = ""
-
-#     for line in cleaned_lines:
-#         if line.lstrip().startswith('|'): # If line starts with |, it's a new row
-#             if buffer:
-#                 merged_lines.append(buffer) # Append the buffered content
-#             buffer = line # Start a new buffer with this row
-#         else:
-#             # Continuation of the previous row
-#             buffer += ' ' + line.strip() # Add content to the current buffer
-
-#     # Don't forget to append the last buffer
-#     if buffer:
-#         merged_lines.append(buffer)
-
-#     # Fix the header separator row if necessary
-#     if len(merged_lines) > 1:
-#         header_pipes = merged_lines[0].count('|') # Count pipes in the header row
-#         header_separator = '|---|' * (header_pipes - 1) + '|---|' # Generate proper separator
-
-#         # Replace or insert the separator row
-#         if not re.match(r'^\|[-:|]+$', merged_lines[1]): # Check if the second row is a valid separator
-#             merged_lines.insert(1, header_separator)
-#         else:
-#             # Adjust the separator to match the header pipes
-#             merged_lines[1] = '|---|' * (header_pipes - 1) + '|'
-
-#     # Ensure consistent number of pipes in each row
-#     result = []
-#     header_pipes = merged_lines[0].count('|') # Use the header row to count the number of pipes
-
-#     for line in merged_lines:
-#         # Strip excessive whitespace around pipes
-#         line = re.sub(r'\s*\|\s*', '|', line.strip())
-
-#         # Fix inconsistent number of pipes by adjusting them to match the header
-#         pipe_count = line.count('|')
-#         if pipe_count < header_pipes:
-#             line += '|' * (header_pipes - pipe_count) # Add missing pipes
-#         elif pipe_count > header_pipes:
-#             # If too many pipes, split line and keep the first `header_pipes` columns
-#             columns = line.split('|')[:header_pipes + 1] # +1 to keep last pipe at the end
-#             line = '|'.join(columns)
-
-#         line = re.sub(r'(\d),(?=\d)', r'\1, ', line)
-
-#         result.append(line)
-
-#     # Join lines back into the cleaned markdown text
-#     cleaned_text = '\n'.join(result)
-
-#     # Replace numbers next to commas and other numbers with a space
-
-#     return cleaned_text
-
 def clean_column_name(column_name, max_length=20):
     # Convert to string
     column_name = str(column_name)
@@ -751,31 +634,6 @@ def create_unique_table_df_from_reference_table(reference_df:pd.DataFrame):
751
  .assign(Topic_number=lambda df: np.arange(1, len(df) + 1)) # Add numbering 1 to x
752
  )
753
 
754
- # new_unique_topics_df = reference_df[["General Topic", "Subtopic", "Sentiment"]]
755
-
756
- # new_unique_topics_df = new_unique_topics_df.rename(columns={new_unique_topics_df.columns[0]: "General Topic", new_unique_topics_df.columns[1]: "Subtopic", new_unique_topics_df.columns[2]: "Sentiment"})
757
-
758
- # # Join existing and new unique topics
759
- # out_unique_topics_df = new_unique_topics_df
760
-
761
- # out_unique_topics_df = out_unique_topics_df.rename(columns={out_unique_topics_df.columns[0]: "General Topic", out_unique_topics_df.columns[1]: "Subtopic", out_unique_topics_df.columns[2]: "Sentiment"})
762
-
763
- # #print("out_unique_topics_df:", out_unique_topics_df)
764
-
765
- # out_unique_topics_df = out_unique_topics_df.drop_duplicates(["General Topic", "Subtopic", "Sentiment"]).\
766
- # drop(["Response References", "Summary"], axis = 1, errors="ignore")
767
-
768
- # # Get count of rows that refer to particular topics
769
- # reference_counts = reference_df.groupby(["General Topic", "Subtopic", "Sentiment"]).agg({
770
- # 'Response References': 'size', # Count the number of references
771
- # 'Summary': lambda x: '<br>'.join(
772
- # sorted(set(x), key=lambda summary: reference_df.loc[reference_df['Summary'] == summary, 'Start row of group'].min())
773
- # )
774
- # }).reset_index()
775
-
776
- # # Join the counts to existing_unique_topics_df
777
- # out_unique_topics_df = out_unique_topics_df.merge(reference_counts, how='left', on=["General Topic", "Subtopic", "Sentiment"]).sort_values("Response References", ascending=False)
778
-
779
  return out_unique_topics_df
780
 
781
  # Convert output table to markdown and then to a pandas dataframe to csv
@@ -933,8 +791,6 @@ def call_llm_with_markdown_table_checks(batch_prompts: List[str],
933
  call_temperature, reported_batch_no, local_model, master=master
934
  )
935
 
936
- print("Responses:", responses)
937
-
938
  if (model_choice != "gemma_local") & (model_choice != "gemma_2b_it_local"):
939
  stripped_response = responses[-1].text.strip()
940
  else:
@@ -1041,7 +897,16 @@ def write_llm_output_and_logs(responses: List[ResponseObject],
1041
  return topic_table_out_path, reference_table_out_path, unique_topics_df_out_path, topic_with_response_df, markdown_table, out_reference_df, out_unique_topics_df, batch_file_path_details, is_error
1042
 
1043
  # Rename columns to ensure consistent use of data frames later in code
1044
- topic_with_response_df.columns = ["General Topic", "Subtopic", "Sentiment", "Response References", "Summary"]
1045
 
1046
  # Fill in NA rows with values from above (topics seem to be included only on one row):
1047
  topic_with_response_df = topic_with_response_df.ffill()
@@ -1073,8 +938,8 @@ def write_llm_output_and_logs(responses: List[ResponseObject],
1073
  sentiment = row.iloc[2] if pd.notna(row.iloc[2]) else ""
1074
  summary = row.iloc[4] if pd.notna(row.iloc[4]) else ""
1075
  # If the reference response column is very long, and there's nothing in the summary column, assume that the summary was put in the reference column
1076
- if not summary and len(row.iloc[3] > 30):
1077
- summary = row.iloc[3]
1078
 
1079
  summary = row_number_string_start + summary
1080
 
@@ -1151,6 +1016,128 @@ def write_llm_output_and_logs(responses: List[ResponseObject],
1151
 
1152
  return topic_table_out_path, reference_table_out_path, unique_topics_df_out_path, topic_with_response_df, markdown_table, out_reference_df, out_unique_topics_df, batch_file_path_details, is_error
1153

1154
  @spaces.GPU
1155
  def extract_topics(in_data_file,
1156
  file_data:pd.DataFrame,
@@ -1184,6 +1171,8 @@ def extract_topics(in_data_file,
1184
  sentiment_checkbox:str = "Negative, Neutral, or Positive",
1185
  force_zero_shot_radio:str = "No",
1186
  in_excel_sheets:List[str] = [],
 
 
1187
  max_tokens:int=max_tokens,
1188
  model_name_map:dict=model_name_map,
1189
  max_time_for_loop:int=max_time_for_loop,
@@ -1224,7 +1213,9 @@ def extract_topics(in_data_file,
1224
  - time_taken (float, optional): The amount of time taken to process the responses up until this point.
1225
  - sentiment_checkbox (str, optional): What type of sentiment analysis should the topic modeller do?
1226
  - force_zero_shot_radio (str, optional): Should responses be forced into a zero shot topic or not.
1227
- - in_excel_sheets (List[str], optional): List of excel sheets to load from input file
 
 
1228
  - max_tokens (int): The maximum number of tokens for the model.
1229
  - model_name_map (dict, optional): A dictionary mapping full model name to shortened.
1230
  - max_time_for_loop (int, optional): The number of seconds maximum that the function should run for before breaking (to run again, this is to avoid timeouts with some AWS services if deployed there).
@@ -1254,17 +1245,13 @@ def extract_topics(in_data_file,
1254
  if file_data.empty:
1255
  print("No data table found, loading from file")
1256
  try:
1257
- #print("in_data_file:", in_data_file)
1258
  in_colnames_drop, in_excel_sheets, file_name = put_columns_in_df(in_data_file)
1259
- #print("in_colnames:", in_colnames_drop)
1260
  file_data, file_name, num_batches = load_in_data_file(in_data_file, chosen_cols, batch_size_default, in_excel_sheets)
1261
- #print("file_data loaded in:", file_data)
1262
  except:
1263
  # Check if files and text exist
1264
  out_message = "Please enter a data file to summarise."
1265
  print(out_message)
1266
  raise Exception(out_message)
1267
- #return out_message, existing_topics_table, existing_unique_topics_df, existing_reference_df, out_file_paths, out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, out_file_paths, out_file_paths#, out_message
1268
 
1269
 
1270
  #model_choice_clean = replace_punctuation_with_underscore(model_choice)
@@ -1277,12 +1264,10 @@ def extract_topics(in_data_file,
1277
  latest_batch_completed = 0
1278
  out_message = []
1279
  out_file_paths = []
1280
- #print("model_choice_clean:", model_choice_clean)
1281
 
1282
  if (model_choice == "gemma_2b_it_local") & (RUN_LOCAL_MODEL == "1"):
1283
  progress(0.1, "Loading in Gemma 2b model")
1284
  local_model, tokenizer = load_model()
1285
- print("Local model loaded:", local_model)
1286
 
1287
  if num_batches > 0:
1288
  progress_measure = round(latest_batch_completed / num_batches, 1)
@@ -1301,12 +1286,10 @@ def extract_topics(in_data_file,
1301
  out_file_paths = []
1302
 
1303
 
1304
- if model_choice == "anthropic.claude-3-sonnet-20240229-v1:0" and file_data.shape[1] > 300:
1305
  out_message = "Your data has more than 300 rows, using the Sonnet model will be too expensive. Please choose the Haiku model instead."
1306
  print(out_message)
1307
- raise Exception(out_message)
1308
- #return out_message, existing_topics_table, existing_unique_topics_df, existing_reference_df, out_file_paths, out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, out_file_paths, out_file_paths#, out_message
1309
-
1310
 
1311
  if sentiment_checkbox == "Negative, Neutral, or Positive": sentiment_prompt = "In the third column, write the sentiment of the Subtopic: Negative, Neutral, or Positive"
1312
  elif sentiment_checkbox == "Negative or Positive": sentiment_prompt = "In the third column, write the sentiment of the Subtopic: Negative or Positive"
@@ -1337,10 +1320,10 @@ def extract_topics(in_data_file,
1337
  if latest_batch_completed >= 1 or candidate_topics is not None:
1338
 
1339
  # Prepare Gemini models before query
1340
- if model_choice in ["gemini-2.0-flash", "gemini-1.5-pro-002"]:
1341
  print("Using Gemini model:", model_choice)
1342
  model, config = construct_gemini_generative_model(in_api_key=in_api_key, temperature=temperature, model_choice=model_choice, system_prompt=add_existing_topics_system_prompt, max_tokens=max_tokens)
1343
- elif model_choice in ["anthropic.claude-3-haiku-20240307-v1:0", "anthropic.claude-3-sonnet-20240229-v1:0"]:
1344
  print("Using AWS Bedrock model:", model_choice)
1345
  else:
1346
  print("Using local model:", model_choice)
@@ -1351,109 +1334,17 @@ def extract_topics(in_data_file,
1351
 
1352
  # 'Zero shot topics' are those supplied by the user
1353
  max_topic_no = 120
1354
- zero_shot_topics = read_file(candidate_topics.name)
1355
-
1356
- # Max 120 topics allowed
1357
- if zero_shot_topics.shape[0] > max_topic_no:
1358
- print("Maximum", max_topic_no, "topics allowed to fit within large language model context limits.")
1359
- zero_shot_topics = zero_shot_topics.iloc[:max_topic_no, :]
1360
-
1361
- # Forward slashes in the topic names seems to confuse the model
1362
- if zero_shot_topics.shape[1] >= 1: # Check if there is at least one column
1363
- for x in zero_shot_topics.columns:
1364
- zero_shot_topics.loc[:, x] = (
1365
- zero_shot_topics.loc[:, x]
1366
- .str.strip()
1367
- .str.replace('\n', ' ')
1368
- .str.replace('\r', ' ')
1369
- .str.replace('/', ' or ')
1370
- .str.lower()
1371
- .str.capitalize())
1372
-
1373
- # If number of columns is 1, keep only subtopics
1374
- if zero_shot_topics.shape[1] == 1 and "General Topic" not in zero_shot_topics.columns:
1375
- zero_shot_topics_gen_topics_list = [""] * zero_shot_topics.shape[0]
1376
- zero_shot_topics_subtopics_list = list(zero_shot_topics.iloc[:, 0])
1377
- # Allow for possibility that the user only wants to set general topics and not subtopics
1378
- elif zero_shot_topics.shape[1] == 1 and "General Topic" in zero_shot_topics.columns:
1379
- zero_shot_topics_gen_topics_list = list(zero_shot_topics["General Topic"])
1380
- zero_shot_topics_subtopics_list = [""] * zero_shot_topics.shape[0]
1381
- # If general topic and subtopic are specified
1382
- elif set(["General Topic", "Subtopic"]).issubset(zero_shot_topics.columns):
1383
- zero_shot_topics_gen_topics_list = list(zero_shot_topics["General Topic"])
1384
- zero_shot_topics_subtopics_list = list(zero_shot_topics["Subtopic"])
1385
- # If number of columns is 2, keep general topics and subtopics
1386
- elif zero_shot_topics.shape[1] == 2:
1387
- zero_shot_topics_gen_topics_list = list(zero_shot_topics.iloc[:, 0])
1388
- zero_shot_topics_subtopics_list = list(zero_shot_topics.iloc[:, 1])
1389
- else:
1390
- # If there are more columns, just assume that the first column was meant to be a subtopic
1391
- zero_shot_topics_gen_topics_list = [""] * zero_shot_topics.shape[0]
1392
- zero_shot_topics_subtopics_list = list(zero_shot_topics.iloc[:, 0])
1393
-
1394
- # If the responses are being forced into zero shot topics, allow an option for nothing relevant
1395
- if force_zero_shot_radio == "Yes":
1396
- zero_shot_topics_gen_topics_list.append("")
1397
- zero_shot_topics_subtopics_list.append("No topics are relevant to the response")
1398
-
1399
- if create_revised_general_topics == True:
1400
- # Create the most up to date list of topics and subtopics.
1401
- # If there are candidate topics, but the existing_unique_topics_df hasn't yet been constructed, then create.
1402
- unique_topics_df = pd.DataFrame(data={
1403
- "General Topic":zero_shot_topics_gen_topics_list,
1404
- "Subtopic":zero_shot_topics_subtopics_list
1405
- })
1406
- unique_topics_markdown = unique_topics_df.to_markdown()
1407
-
1408
- print("unique_topics_markdown:", unique_topics_markdown)
1409
-
1410
- formatted_general_topics_system_prompt = create_general_topics_system_prompt.format(consultation_context=context_textbox, column_name=chosen_cols)
1411
-
1412
- # Format the general_topics prompt with the topics
1413
- formatted_general_topics_prompt = create_general_topics_prompt.format(topics=unique_topics_markdown)
1414
-
1415
- if model_choice == "gemma_2b_it_local":
1416
- formatted_general_topics_prompt = llama_cpp_prefix + formatted_general_topics_system_prompt + "\n" + formatted_general_topics_prompt + llama_cpp_suffix
1417
-
1418
- formatted_general_topics_prompt_list = [formatted_general_topics_prompt]
1419
-
1420
-
1421
-
1422
- whole_conversation = []
1423
-
1424
- general_topic_response, general_topic_conversation_history, general_topic_conversation, general_topic_conversation_metadata, response_text = call_llm_with_markdown_table_checks(batch_prompts, system_prompt, conversation_history, whole_conversation, whole_conversation_metadata, model, config, model_choice, temperature, reported_batch_no, local_model, MAX_OUTPUT_VALIDATION_ATTEMPTS, master = True)
1425
-
1426
- # Convert response text to a markdown table
1427
- try:
1428
- zero_shot_topics_df, is_error = convert_response_text_to_markdown_table(response_text, table_type = "Revised topics table")
1429
- print("Output revised zero shot topics table is:", zero_shot_topics_df)
1430
-
1431
- zero_shot_revised_path = output_folder + "zero_shot_topics_with_general_topics.csv"
1432
- #zero_shot_topics_df.to_csv(zero_shot_revised_path, index = None)
1433
- out_file_paths.append(zero_shot_revised_path)
1434
-
1435
- except Exception as e:
1436
- print("Error in parsing markdown table from response text:", e, "Not adding revised General Topics to table")
1437
- zero_shot_topics_df = pd.DataFrame(data={
1438
- "General Topic":zero_shot_topics_gen_topics_list,
1439
- "Subtopic":zero_shot_topics_subtopics_list})
1440
-
1441
- if zero_shot_topics_df.empty:
1442
- print("Creation of revised general topics df failed, reverting to original list")
1443
- zero_shot_topics_df = pd.DataFrame(data={
1444
- "General Topic":zero_shot_topics_gen_topics_list,
1445
- "Subtopic":zero_shot_topics_subtopics_list})
1446
- else:
1447
- zero_shot_topics_df = pd.DataFrame(data={
1448
- "General Topic":zero_shot_topics_gen_topics_list,
1449
- "Subtopic":zero_shot_topics_subtopics_list})
1450
-
1451
-
1452
- # This part concatenates all zero shot and new topics together, so that for the next prompt the LLM will have the full list available
1453
- if not existing_unique_topics_df.empty:
1454
- existing_unique_topics_df = pd.concat([existing_unique_topics_df, zero_shot_topics_df]).drop_duplicates("Subtopic")
1455
- else:
1456
- existing_unique_topics_df = zero_shot_topics_df
1457
 
1458
  if candidate_topics and not zero_shot_topics_df.empty:
1459
  # If you have already created revised zero shot topics, concat to the current
@@ -1464,24 +1355,40 @@ def extract_topics(in_data_file,
1464
  existing_unique_topics_df.fillna("", inplace=True)
1465
  existing_unique_topics_df["General Topic"] = existing_unique_topics_df["General Topic"].str.replace('(?i)^Nan$', '', regex=True)
1466
  existing_unique_topics_df["Subtopic"] = existing_unique_topics_df["Subtopic"].str.replace('(?i)^Nan$', '', regex=True)
1467
 
1468
  # print("existing_unique_topics_df:", existing_unique_topics_df)
1469
 
1470
  # If user has chosen to try to force zero shot topics, then the prompt is changed to ask the model not to deviate at all from submitted topic list.
1471
- if force_zero_shot_radio == "Yes":
1472
- unique_topics_markdown = existing_unique_topics_df[["Subtopic"]].drop_duplicates(["Subtopic"]).to_markdown(index=False)
1473
  topic_assignment_prompt = force_existing_topics_prompt
1474
  else:
1475
- unique_topics_markdown = existing_unique_topics_df[["General Topic", "Subtopic"]].drop_duplicates(["General Topic", "Subtopic"]).to_markdown(index=False)
1476
- topic_assignment_prompt = allow_new_topics_prompt
1477
-
1478
 
1479
  # Format the summary prompt with the response table and topics
1480
  formatted_system_prompt = add_existing_topics_system_prompt.format(consultation_context=context_textbox, column_name=chosen_cols)
1481
- formatted_summary_prompt = add_existing_topics_prompt.format(response_table=normalised_simple_markdown_table, topics=unique_topics_markdown, topic_assignment=topic_assignment_prompt, sentiment_choices=sentiment_prompt)
1482
 
1483
 
1484
- if model_choice == "gemma_2b_it_local":
1485
  formatted_summary_prompt = llama_cpp_prefix + formatted_system_prompt + "\n" + formatted_summary_prompt + llama_cpp_suffix
1486
  full_prompt = formatted_summary_prompt
1487
  else:
@@ -1499,7 +1406,7 @@ def extract_topics(in_data_file,
1499
  except Exception as e:
1500
  print(f"Error writing prompt to file {formatted_prompt_output_path}: {e}")
1501
 
1502
- if model_choice == "gemma_2b_it_local":
1503
  summary_prompt_list = [full_prompt] # Includes system prompt
1504
  else:
1505
  summary_prompt_list = [formatted_summary_prompt]
@@ -1510,13 +1417,9 @@ def extract_topics(in_data_file,
1510
  whole_conversation = []
1511
 
1512
  # Process requests to large language model
1513
- # responses, summary_conversation_history, whole_conversation, whole_conversation_metadata, response_text = process_requests(summary_prompt_list, add_existing_topics_system_prompt, summary_conversation_history, summary_whole_conversation, whole_conversation_metadata, model, config, model_choice, temperature, reported_batch_no, local_model, master = True)
1514
-
1515
  responses, conversation_history, whole_conversation, whole_conversation_metadata, response_text = call_llm_with_markdown_table_checks(summary_prompt_list, system_prompt, conversation_history, whole_conversation, whole_conversation_metadata, model, config, model_choice, temperature, reported_batch_no, local_model, MAX_OUTPUT_VALIDATION_ATTEMPTS, master = True)
1516
 
1517
- # print("responses:", responses[-1].text)
1518
- # print("Whole conversation metadata:", whole_conversation_metadata)
1519
-
1520
  topic_table_out_path, reference_table_out_path, unique_topics_df_out_path, new_topic_df, new_markdown_table, new_reference_df, new_unique_topics_df, master_batch_out_file_part, is_error = write_llm_output_and_logs(responses, whole_conversation, whole_conversation_metadata, file_name, latest_batch_completed, start_row, end_row, model_choice_clean, temperature, log_files_output_paths, existing_reference_df, existing_unique_topics_df, batch_size, chosen_cols, first_run=False)
1521
 
1522
  # Write final output to text file for logging purposes
@@ -1541,7 +1444,6 @@ def extract_topics(in_data_file,
1541
  if is_error == True:
1542
  final_message_out = "Could not complete summary, error in LLM output."
1543
  raise Exception(final_message_out)
1544
- #return unique_table_df_display_table_markdown, new_topic_df, new_unique_topics_df, new_reference_df, out_file_paths, out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, out_file_paths#, final_message_out
1545
 
1546
  # Write outputs to csv
1547
  ## Topics with references
@@ -1560,7 +1462,7 @@ def extract_topics(in_data_file,
1560
 
1561
  # Outputs for markdown table output
1562
  unique_table_df_display_table = new_unique_topics_df.apply(lambda col: col.map(lambda x: wrap_text(x, max_text_length=500)))
1563
- unique_table_df_display_table_markdown = unique_table_df_display_table.to_markdown(index=False)
1564
 
1565
  #whole_conversation_metadata.append(whole_conversation_metadata_str)
1566
  whole_conversation_metadata_str = ' '.join(whole_conversation_metadata)
@@ -1579,11 +1481,11 @@ def extract_topics(in_data_file,
1579
  #system_prompt = system_prompt + normalised_simple_markdown_table
1580
 
1581
  # Prepare Gemini models before query
1582
- if model_choice in ["gemini-2.0-flash", "gemini-1.5-pro-002"]:
1583
  print("Using Gemini model:", model_choice)
1584
  model, config = construct_gemini_generative_model(in_api_key=in_api_key, temperature=temperature, model_choice=model_choice, system_prompt=system_prompt, max_tokens=max_tokens)
1585
- elif model_choice in ["gemma_2b_it_local"]:
1586
- print("Using local Gemma 2b model")
1587
  else:
1588
  print("Using AWS Bedrock model:", model_choice)
1589
 
@@ -1597,7 +1499,7 @@ def extract_topics(in_data_file,
1597
  if prompt3: formatted_prompt3 = prompt3.format(response_table=normalised_simple_markdown_table, sentiment_choices=sentiment_prompt)
1598
  else: formatted_prompt3 = prompt3
1599
 
1600
- if model_choice == "gemma_2b_it_local":
1601
  formatted_initial_table_prompt = llama_cpp_prefix + formatted_initial_table_system_prompt + "\n" + formatted_initial_table_prompt + llama_cpp_suffix
1602
  formatted_prompt2 = llama_cpp_prefix + formatted_initial_table_system_prompt + "\n" + formatted_prompt2 + llama_cpp_suffix
1603
  formatted_prompt3 = llama_cpp_prefix + formatted_initial_table_system_prompt + "\n" + formatted_prompt3 + llama_cpp_suffix
@@ -1703,6 +1605,8 @@ def extract_topics(in_data_file,
1703
  # Set to a very high number so as not to mess with subsequent file processing by the user
1704
  #latest_batch_completed = 999
1705
 
 
 
1706
  toc = time.perf_counter()
1707
  final_time = (toc - tic) + time_taken
1708
  out_time = f"Everything finished in {round(final_time,1)} seconds."
@@ -1733,6 +1637,7 @@ def extract_topics(in_data_file,
1733
  ## Reference table mapping response numbers to topics
1734
  existing_reference_df.to_csv(reference_table_out_path, index=None)
1735
  out_file_paths.append(reference_table_out_path)
 
1736
 
1737
  # Create final unique topics table from reference table to ensure consistent numbers
1738
  final_out_unique_topics_df = create_unique_table_df_from_reference_table(existing_reference_df)
@@ -1787,13 +1692,10 @@ def extract_topics(in_data_file,
1787
 
1788
  print("latest_batch_completed at end of batch iterations to return is", latest_batch_completed)
1789
 
1790
- return unique_table_df_display_table_markdown, existing_topics_table, final_out_unique_topics_df, existing_reference_df, final_out_file_paths, final_out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, final_out_file_paths, final_out_file_paths, gr.Dataframe(value=modifiable_unique_topics_df, headers=None, col_count=(modifiable_unique_topics_df.shape[1], "fixed"), row_count = (modifiable_unique_topics_df.shape[0], "fixed"), visible=True, type="pandas"), final_out_file_paths
1791
-
1792
-
1793
- return unique_table_df_display_table_markdown, existing_topics_table, existing_unique_topics_df, existing_reference_df, out_file_paths, out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, out_file_paths, out_file_paths, gr.Dataframe(value=modifiable_unique_topics_df, headers=None, col_count=(modifiable_unique_topics_df.shape[1], "fixed"), row_count = (modifiable_unique_topics_df.shape[0], "fixed"), visible=True, type="pandas"), out_file_paths
1794
-
1795
 
1796
 
 
1797
 
1798
  def convert_reference_table_to_pivot_table(df:pd.DataFrame, basic_response_data:pd.DataFrame=pd.DataFrame()):
1799
 
@@ -2302,7 +2204,7 @@ def summarise_output_topics_query(model_choice:str, in_api_key:str, temperature:
2302
  whole_conversation_metadata = []
2303
 
2304
  # Prepare Gemini models before query
2305
- if model_choice in ["gemini-2.0-flash", "gemini-1.5-pro-002"]:
2306
  print("Using Gemini model:", model_choice)
2307
  model, config = construct_gemini_generative_model(in_api_key=in_api_key, temperature=temperature, model_choice=model_choice, system_prompt=system_prompt, max_tokens=max_tokens)
2308
  else:
@@ -2464,7 +2366,7 @@ def summarise_output_topics(summarised_references:pd.DataFrame,
2464
  if (model_choice == "gemma_2b_it_local") & (RUN_LOCAL_MODEL == "1"):
2465
  progress(0.1, "Loading in Gemma 2b model")
2466
  local_model, tokenizer = load_model()
2467
- print("Local model loaded:", local_model)
2468
 
2469
  summary_loop_description = "Creating summaries. " + str(latest_summary_completed) + " summaries completed so far."
2470
  summary_loop = tqdm(range(latest_summary_completed, length_all_summaries), desc="Creating summaries", unit="summaries")
 
19
 
20
  GradioFileData = gr.FileData
21
 
22
+ from tools.prompts import initial_table_prompt, prompt2, prompt3, system_prompt, summarise_topic_descriptions_prompt, summarise_topic_descriptions_system_prompt, add_existing_topics_system_prompt, add_existing_topics_prompt, create_general_topics_system_prompt, create_general_topics_prompt, force_existing_topics_prompt, allow_new_topics_prompt, force_single_topic_prompt
23
+ from tools.helper_functions import output_folder, read_file, get_or_create_env_var, model_name_map, put_columns_in_df, wrap_text, initial_clean, load_in_data_file, load_in_file
24
  from tools.chatfuncs import LlamaCPPGenerationConfig, call_llama_cpp_model, load_model, RUN_LOCAL_MODEL
25
 
26
  # ResponseObject class for AWS Bedrock calls
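Note: the ResponseObject definition itself sits outside this hunk; based on how it is constructed and read elsewhere in this file (ResponseObject(text=..., usage_metadata=...) and responses[-1].text), a minimal sketch would be:

class ResponseObject:
    # Stand-in mirroring the Gemini response surface (.text plus
    # .usage_metadata), so AWS Bedrock results can flow through the
    # same downstream handling.
    def __init__(self, text: str, usage_metadata: dict):
        self.text = text
        self.usage_metadata = usage_metadata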
 
59
 
60
  return text
61

62
  def load_in_previous_data_files(file_paths_partial_output:List[str], for_modified_table:bool=False):
63
  '''Load in data table from a partially completed consultation summary to continue it.'''
64
 
 
130
 
131
  return gr.Dataframe(value=unique_file_data, headers=None, col_count=(unique_file_data.shape[1], "fixed"), row_count = (unique_file_data.shape[0], "fixed"), visible=True, type="pandas"), reference_file_data, unique_file_data, reference_file_name, unique_file_name, out_file_names
132
 
133
+
134
  def get_basic_response_data(file_data:pd.DataFrame, chosen_cols:List[str], verify_titles:bool=False) -> pd.DataFrame:
135
 
136
  if not isinstance(chosen_cols, list):
 
144
  if verify_titles == True:
145
  basic_response_data = basic_response_data.rename(columns={chosen_cols[0]: "Response", chosen_cols[1]: "Title"})
146
  basic_response_data["Title"] = basic_response_data["Title"].str.strip()
147
+ basic_response_data["Title"] = basic_response_data["Title"].apply(initial_clean)
148
  else:
149
  basic_response_data = basic_response_data.rename(columns={chosen_cols[0]: "Response"})
150
 
151
  basic_response_data["Response"] = basic_response_data["Response"].str.strip()
152
+ basic_response_data["Response"] = basic_response_data["Response"].apply(initial_clean)
153
 
154
  return basic_response_data
155
 
 
192
  else:
193
  end_row = file_len + 1
194
 
195
+ #print("start_row:", start_row)
196
+ #print("end_row:", end_row)
197
 
198
  batch_basic_response_data = basic_response_data[start_row:end_row] # Select the current batch
199
 
200
+ #print("batch_basic_response_data:", batch_basic_response_data)
201
 
202
  # Now replace the reference numbers with numbers starting from 1
203
  batch_basic_response_data.loc[:, "Reference"] = batch_basic_response_data["Reference"] - start_row
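Note: a quick worked example of the re-numbering above, with assumed values. For a batch covering rows 150-199 of a 300-row table (1-based references 151-200), subtracting start_row re-bases the references to 1-50:

import pandas as pd

basic_response_data = pd.DataFrame({"Reference": range(1, 301)})
start_row, end_row = 150, 200
batch = basic_response_data[start_row:end_row].copy()
batch.loc[:, "Reference"] = batch["Reference"] - start_row
assert batch["Reference"].tolist() == list(range(1, 51))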
 
345
 
346
  # Now you can access both the text and metadata
347
  #print("Text:", response.text)
348
+ #print("Metadata:", response.usage_metadata)
349
  #print("Text:", response.text)
350
 
351
  return response
 
375
  progress_bar = range(0,number_of_api_retry_attempts)
376
 
377
  # Generate the model's response
378
+ if "gemini" in model_choice:
379
 
380
  for i in progress_bar:
381
  try:
 
398
 
399
  if i == number_of_api_retry_attempts:
400
  return ResponseObject(text="", usage_metadata={'RequestId':"FAILED"}), conversation_history
401
+ elif "anthropic.claude" in model_choice:
402
  for i in progress_bar:
403
  try:
404
  print("Calling AWS Claude model, attempt", i + 1)
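Note: replacing the exact model-name lists with substring checks means newly released model versions route correctly without editing these branches; a condensed sketch of the dispatch logic (route_model is a hypothetical name):

def route_model(model_choice: str) -> str:
    if "gemini" in model_choice:
        return "gemini"            # Google Gemini API
    elif "anthropic.claude" in model_choice:
        return "bedrock"           # AWS Bedrock Claude
    return "local"                 # e.g. gemma_2b_it_local

assert route_model("gemini-2.0-flash") == "gemini"
assert route_model("anthropic.claude-3-haiku-20240307-v1:0") == "bedrock"
assert route_model("gemma_2b_it_local") == "local"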
 
608
 
609
  return result
610

611
  def clean_column_name(column_name, max_length=20):
612
  # Convert to string
613
  column_name = str(column_name)
 
634
  .assign(Topic_number=lambda df: np.arange(1, len(df) + 1)) # Add numbering 1 to x
635
  )
636

637
  return out_unique_topics_df
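Note: the Topic_number column assigned above simply numbers the deduplicated topics from 1 to n in their final sort order, e.g.:

import numpy as np
import pandas as pd

topics = pd.DataFrame({"Subtopic": ["Parking", "Road safety", "Street lighting"]})
topics = topics.assign(Topic_number=lambda df: np.arange(1, len(df) + 1))
# topics["Topic_number"].tolist() == [1, 2, 3]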
638
 
639
  # Convert output table to markdown and then to a pandas dataframe to csv
 
791
  call_temperature, reported_batch_no, local_model, master=master
792
  )
793
 
 
 
794
  if (model_choice != "gemma_local") & (model_choice != "gemma_2b_it_local"):
795
  stripped_response = responses[-1].text.strip()
796
  else:
 
897
  return topic_table_out_path, reference_table_out_path, unique_topics_df_out_path, topic_with_response_df, markdown_table, out_reference_df, out_unique_topics_df, batch_file_path_details, is_error
898
 
899
  # Rename columns to ensure consistent use of data frames later in code
900
+ new_column_names = {
901
+ topic_with_response_df.columns[0]: "General Topic",
902
+ topic_with_response_df.columns[1]: "Subtopic",
903
+ topic_with_response_df.columns[2]: "Sentiment",
904
+ topic_with_response_df.columns[3]: "Response References",
905
+ topic_with_response_df.columns[4]: "Summary"
906
+ }
907
+
908
+ topic_with_response_df = topic_with_response_df.rename(columns=new_column_names)
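Note: the positional rename above assumes the parsed LLM table came back with at least five columns; a defensive variant (hypothetical helper, not part of this commit) would fail with a clear message rather than an IndexError on malformed model output:

import pandas as pd

EXPECTED_HEADINGS = ["General Topic", "Subtopic", "Sentiment", "Response References", "Summary"]

def rename_llm_table(df: pd.DataFrame) -> pd.DataFrame:
    # Guard before renaming the first five columns by position.
    if df.shape[1] < len(EXPECTED_HEADINGS):
        raise ValueError(f"Expected at least {len(EXPECTED_HEADINGS)} columns, got {df.shape[1]}")
    return df.rename(columns=dict(zip(df.columns[:len(EXPECTED_HEADINGS)], EXPECTED_HEADINGS)))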
909
+
910
 
911
  # Fill in NA rows with values from above (topics seem to be included only on one row):
912
  topic_with_response_df = topic_with_response_df.ffill()
 
938
  sentiment = row.iloc[2] if pd.notna(row.iloc[2]) else ""
939
  summary = row.iloc[4] if pd.notna(row.iloc[4]) else ""
940
  # If the reference response column is very long, and there's nothing in the summary column, assume that the summary was put in the reference column
941
+ if not summary and len(str(row.iloc[3])) > 30:
942
+ summary = row.iloc[3]
943
 
944
  summary = row_number_string_start + summary
945
 
 
1016
 
1017
  return topic_table_out_path, reference_table_out_path, unique_topics_df_out_path, topic_with_response_df, markdown_table, out_reference_df, out_unique_topics_df, batch_file_path_details, is_error
1018
 
1019
+ def generate_zero_shot_topics_df(zero_shot_topics:pd.DataFrame,
1020
+ force_zero_shot_radio:str="No",
1021
+ create_revised_general_topics:bool=False,
1022
+ max_topic_no:int=120):
1023
+
1024
+ # Max 120 topics allowed
1025
+ if zero_shot_topics.shape[0] > max_topic_no:
1026
+ print("Maximum", max_topic_no, "topics allowed to fit within large language model context limits.")
1027
+ zero_shot_topics = zero_shot_topics.iloc[:max_topic_no, :]
1028
+
1029
+ # Forward slashes in the topic names seems to confuse the model
1030
+ if zero_shot_topics.shape[1] >= 1: # Check if there is at least one column
1031
+ for x in zero_shot_topics.columns:
1032
+ if not zero_shot_topics[x].isnull().all():
1033
+ zero_shot_topics[x] = zero_shot_topics[x].apply(initial_clean)
1034
+
1035
+ zero_shot_topics.loc[:, x] = (
1036
+ zero_shot_topics.loc[:, x]
1037
+ .str.strip()
1038
+ .str.replace('\n', ' ')
1039
+ .str.replace('\r', ' ')
1040
+ .str.replace('/', ' or ')
1041
+ .str.lower()
1042
+ .str.capitalize())
1043
+
1044
+ #print("zero_shot_topics:", zero_shot_topics)
1045
+
1046
+ # If number of columns is 1, keep only subtopics
1047
+ if zero_shot_topics.shape[1] == 1 and "General topic" not in zero_shot_topics.columns:
1048
+ zero_shot_topics_gen_topics_list = [""] * zero_shot_topics.shape[0]
1049
+ zero_shot_topics_subtopics_list = list(zero_shot_topics.iloc[:, 0])
1050
+ # Allow for possibility that the user only wants to set general topics and not subtopics
1051
+ elif zero_shot_topics.shape[1] == 1 and "General topic" in zero_shot_topics.columns:
1052
+ zero_shot_topics_gen_topics_list = list(zero_shot_topics["General Topic"])
1053
+ zero_shot_topics_subtopics_list = [""] * zero_shot_topics.shape[0]
1054
+ # If general topic and subtopic are specified
1055
+ elif set(["General topic", "Subtopic"]).issubset(zero_shot_topics.columns):
1056
+ print("Found General topic and Subtopic in zero shot topics")
1057
+ zero_shot_topics_gen_topics_list = list(zero_shot_topics["General topic"])
1058
+ zero_shot_topics_subtopics_list = list(zero_shot_topics["Subtopic"])
1059
+
1060
+ # If number of columns is at least 2, keep general topics and subtopics
1061
+ elif zero_shot_topics.shape[1] >= 2 and "Description" not in zero_shot_topics.columns:
1062
+ zero_shot_topics_gen_topics_list = list(zero_shot_topics.iloc[:, 0])
1063
+ zero_shot_topics_subtopics_list = list(zero_shot_topics.iloc[:, 1])
1064
+ else:
1065
+ # If there are more columns, just assume that the first column was meant to be a subtopic
1066
+ zero_shot_topics_gen_topics_list = [""] * zero_shot_topics.shape[0]
1067
+ zero_shot_topics_subtopics_list = list(zero_shot_topics.iloc[:, 0])
1068
+
1069
+ # Add a description if column is present
1070
+ # print("zero_shot_topics.shape[1]:", zero_shot_topics.shape[1])
1071
+ if "Description" in zero_shot_topics.columns:
1072
+ zero_shot_topics_description_list = list(zero_shot_topics["Description"])
1073
+ #print("Description found in topic title. List is:", zero_shot_topics_description_list)
1074
+ elif zero_shot_topics.shape[1] >= 3:
1075
+ zero_shot_topics_description_list = list(zero_shot_topics.iloc[:, 2]) # Assume the third column is description
1076
+ else:
1077
+ zero_shot_topics_description_list = [""] * zero_shot_topics.shape[0]
1078
+
1079
+ # If the responses are being forced into zero shot topics, allow an option for nothing relevant
1080
+ if force_zero_shot_radio == "Yes":
1081
+ zero_shot_topics_gen_topics_list.append("")
1082
+ zero_shot_topics_subtopics_list.append("No relevant topic")
1083
+ zero_shot_topics_description_list.append("")
1084
+
1085
+ if create_revised_general_topics == True:
1086
+ pass
1087
+
1088
+ # The following currently doesn't really work. Excluded for now.
1089
+
1090
+ # unique_topics_df = pd.DataFrame(data={
1091
+ # "General Topic":zero_shot_topics_gen_topics_list,
1092
+ # "Subtopic":zero_shot_topics_subtopics_list,
1093
+ # "Description": zero_shot_topics_description_list
1094
+ # })
1095
+ # unique_topics_markdown = unique_topics_df.to_markdown()
1096
+
1097
+ # #print("unique_topics_markdown:", unique_topics_markdown)
1098
+
1099
+ # formatted_general_topics_system_prompt = create_general_topics_system_prompt.format(consultation_context=context_textbox, column_name=chosen_cols)
1100
+
1101
+ # # Format the general_topics prompt with the topics
1102
+ # formatted_general_topics_prompt = create_general_topics_prompt.format(topics=unique_topics_markdown)
1103
+
1104
+ # if "gemma" in model_choice:
1105
+ # formatted_general_topics_prompt = llama_cpp_prefix + formatted_general_topics_system_prompt + "\n" + formatted_general_topics_prompt + llama_cpp_suffix
1106
+
1107
+ # formatted_general_topics_prompt_list = [formatted_general_topics_prompt]
1108
+
1109
+ # whole_conversation = []
1110
+
1111
+ # general_topic_response, general_topic_conversation_history, general_topic_conversation, general_topic_conversation_metadata, response_text = call_llm_with_markdown_table_checks(batch_prompts, system_prompt, conversation_history, whole_conversation, whole_conversation_metadata, model, config, model_choice, temperature, reported_batch_no, local_model, MAX_OUTPUT_VALIDATION_ATTEMPTS, master = True)
1112
+
1113
+ # # Convert response text to a markdown table
1114
+ # try:
1115
+ # zero_shot_topics_df, is_error = convert_response_text_to_markdown_table(response_text, table_type = "Revised topics table")
1116
+ # print("Output revised zero shot topics table is:", zero_shot_topics_df)
1117
+
1118
+ # zero_shot_revised_path = output_folder + "zero_shot_topics_with_general_topics.csv"
1119
+ # out_file_paths.append(zero_shot_revised_path)
1120
+
1121
+ # except Exception as e:
1122
+ # print("Error in parsing markdown table from response text:", e, "Not adding revised General Topics to table")
1123
+
1124
+ # if zero_shot_topics_df.empty:
1125
+ # print("Creation of revised general topics df failed, reverting to original list")
1126
+ else:
1127
+ pass
1128
+
1129
+ # Add description or not
1130
+ zero_shot_topics_df = pd.DataFrame(data={
1131
+ "General Topic":zero_shot_topics_gen_topics_list,
1132
+ "Subtopic":zero_shot_topics_subtopics_list,
1133
+ "Description": zero_shot_topics_description_list
1134
+ })
1135
+
1136
+ #if not zero_shot_topics_df["Description"].isnull().all():
1137
+ # zero_shot_topics_df["Description"] = zero_shot_topics_df["Description"].apply(initial_clean)
1138
+
1139
+ return zero_shot_topics_df
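Note: a sketch of the input shapes this function accepts, following the branching above (example values assumed):

import pandas as pd

# A single unnamed column is read as Subtopics only:
subtopics_only = pd.DataFrame({"Topic": ["Road safety", "Parking"]})

# Named columns are used as given; a "Description" column (or any third
# column) is carried through to the output:
named = pd.DataFrame({
    "General topic": ["Transport", "Transport"],
    "Subtopic": ["Road safety", "Parking"],
})

# In every case the function returns a frame with the columns "General Topic",
# "Subtopic" and "Description"; with force_zero_shot_radio="Yes" a catch-all
# ("", "No relevant topic", "") row is appended so the model can opt out.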
1140
+
1141
  @spaces.GPU
1142
  def extract_topics(in_data_file,
1143
  file_data:pd.DataFrame,
 
1171
  sentiment_checkbox:str = "Negative, Neutral, or Positive",
1172
  force_zero_shot_radio:str = "No",
1173
  in_excel_sheets:List[str] = [],
1174
+ force_single_topic_radio:str = "No",
1175
+ force_single_topic_prompt:str=force_single_topic_prompt,
1176
  max_tokens:int=max_tokens,
1177
  model_name_map:dict=model_name_map,
1178
  max_time_for_loop:int=max_time_for_loop,
 
1213
  - time_taken (float, optional): The amount of time taken to process the responses up until this point.
1214
  - sentiment_checkbox (str, optional): What type of sentiment analysis should the topic modeller do?
1215
  - force_zero_shot_radio (str, optional): Should responses be forced into a zero shot topic or not.
1216
+ - in_excel_sheets (List[str], optional): List of excel sheets to load from input file.
1217
+ - force_single_topic_radio (str, optional): Should the model be forced to assign only a single topic to each response (effectively acting as a classifier).
1218
+ - force_single_topic_prompt (str, optional): The prompt for forcing the model to assign only one single topic to each response.
1219
  - max_tokens (int): The maximum number of tokens for the model.
1220
  - model_name_map (dict, optional): A dictionary mapping full model name to shortened.
1221
  - max_time_for_loop (int, optional): The number of seconds maximum that the function should run for before breaking (to run again, this is to avoid timeouts with some AWS services if deployed there).
 
1245
  if file_data.empty:
1246
  print("No data table found, loading from file")
1247
  try:
 
1248
  in_colnames_drop, in_excel_sheets, file_name = put_columns_in_df(in_data_file)
 
1249
  file_data, file_name, num_batches = load_in_data_file(in_data_file, chosen_cols, batch_size_default, in_excel_sheets)
 
1250
  except:
1251
  # Check if files and text exist
1252
  out_message = "Please enter a data file to summarise."
1253
  print(out_message)
1254
  raise Exception(out_message)
 
1255
 
1256
 
1257
  #model_choice_clean = replace_punctuation_with_underscore(model_choice)
 
1264
  latest_batch_completed = 0
1265
  out_message = []
1266
  out_file_paths = []
 
1267
 
1268
  if (model_choice == "gemma_2b_it_local") & (RUN_LOCAL_MODEL == "1"):
1269
  progress(0.1, "Loading in Gemma 2b model")
1270
  local_model, tokenizer = load_model()
 
1271
 
1272
  if num_batches > 0:
1273
  progress_measure = round(latest_batch_completed / num_batches, 1)
 
1286
  out_file_paths = []
1287
 
1288
 
1289
+ if "anthropic.claude-3-sonnet" in model_choice and file_data.shape[0] > 300:
1290
  out_message = "Your data has more than 300 rows, using the Sonnet model will be too expensive. Please choose the Haiku model instead."
1291
  print(out_message)
1292
+ raise Exception(out_message)
 
 
1293
 
1294
  if sentiment_checkbox == "Negative, Neutral, or Positive": sentiment_prompt = "In the third column, write the sentiment of the Subtopic: Negative, Neutral, or Positive"
1295
  elif sentiment_checkbox == "Negative or Positive": sentiment_prompt = "In the third column, write the sentiment of the Subtopic: Negative or Positive"
 
1320
  if latest_batch_completed >= 1 or candidate_topics is not None:
1321
 
1322
  # Prepare Gemini models before query
1323
+ if "gemini" in model_choice:
1324
  print("Using Gemini model:", model_choice)
1325
  model, config = construct_gemini_generative_model(in_api_key=in_api_key, temperature=temperature, model_choice=model_choice, system_prompt=add_existing_topics_system_prompt, max_tokens=max_tokens)
1326
+ elif "anthropic.claude" in model_choice:
1327
  print("Using AWS Bedrock model:", model_choice)
1328
  else:
1329
  print("Using local model:", model_choice)
 
1334
 
1335
  # 'Zero shot topics' are those supplied by the user
1336
  max_topic_no = 120
1337
+ zero_shot_topics = read_file(candidate_topics.name)
1338
+
1339
+ zero_shot_topics_df = generate_zero_shot_topics_df(zero_shot_topics, force_zero_shot_radio, create_revised_general_topics, max_topic_no)
1340
+
1341
+ #print("zero_shot_topics_df:", zero_shot_topics_df)
1342
+
1343
+ # This part concatenates all zero shot and new topics together, so that for the next prompt the LLM will have the full list available
1344
+ if not existing_unique_topics_df.empty and force_zero_shot_radio != "Yes":
1345
+ existing_unique_topics_df = pd.concat([existing_unique_topics_df, zero_shot_topics_df]).drop_duplicates("Subtopic")
1346
+ else:
1347
+ existing_unique_topics_df = zero_shot_topics_df
1348
 
1349
  if candidate_topics and not zero_shot_topics_df.empty:
1350
  # If you have already created revised zero shot topics, concat to the current
 
1355
  existing_unique_topics_df.fillna("", inplace=True)
1356
  existing_unique_topics_df["General Topic"] = existing_unique_topics_df["General Topic"].str.replace('(?i)^Nan$', '', regex=True)
1357
  existing_unique_topics_df["Subtopic"] = existing_unique_topics_df["Subtopic"].str.replace('(?i)^Nan$', '', regex=True)
1358
+ existing_unique_topics_df = existing_unique_topics_df.drop_duplicates()
1359
+ if "Description" in existing_unique_topics_df:
1360
+ if existing_unique_topics_df['Description'].isnull().all():
1361
+ existing_unique_topics_df.drop("Description", axis = 1, inplace = True)
1362
 
1363
  # print("existing_unique_topics_df:", existing_unique_topics_df)
1364
 
1365
  # If user has chosen to try to force zero shot topics, then the prompt is changed to ask the model not to deviate at all from submitted topic list.
1366
+ keep_cols = [
1367
+ col for col in ["General Topic", "Subtopic", "Description"]
1368
+ if col in existing_unique_topics_df.columns
1369
+ and not existing_unique_topics_df[col].replace(r'^\s*$', pd.NA, regex=True).isna().all()
1370
+ ]
1371
+
1372
+ if force_zero_shot_radio == "Yes":
1373
+ topics_df_for_markdown = existing_unique_topics_df[keep_cols].drop_duplicates(keep_cols)
1374
+ unique_topics_markdown = topics_df_for_markdown.to_markdown(index=False)
1375
  topic_assignment_prompt = force_existing_topics_prompt
1376
  else:
1377
+ topics_df_for_markdown = existing_unique_topics_df[keep_cols].drop_duplicates(keep_cols)
1378
+ unique_topics_markdown = topics_df_for_markdown.to_markdown(index=False)
1379
+ topic_assignment_prompt = allow_new_topics_prompt
1380
+
1381
+ # Should the output force a single topic assignment per response?
1382
+ if force_single_topic_radio != "Yes": force_single_topic_prompt = ""
1383
+ else:
1384
+ topic_assignment_prompt = topic_assignment_prompt.replace("Assign topics", "Assign a topic").replace("assign Subtopics", "assign a Subtopic").replace("Subtopics", "Subtopic").replace("Topics", "Topic").replace("topics", "a topic")
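Note: the effect of the chained replaces can be checked against the first instruction sentence of allow_new_topics_prompt (reproduced from the tools/prompts.py diff below):

sentence = "In the first and second columns, assign General Topics and Subtopics to Responses."
single = (sentence
          .replace("Assign topics", "Assign a topic")
          .replace("assign Subtopics", "assign a Subtopic")
          .replace("Subtopics", "Subtopic")
          .replace("Topics", "Topic")
          .replace("topics", "a topic"))
# single == "In the first and second columns, assign General Topic and Subtopic to Responses."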
1385
 
1386
  # Format the summary prompt with the response table and topics
1387
  formatted_system_prompt = add_existing_topics_system_prompt.format(consultation_context=context_textbox, column_name=chosen_cols)
1388
+ formatted_summary_prompt = add_existing_topics_prompt.format(response_table=normalised_simple_markdown_table, topics=unique_topics_markdown, topic_assignment=topic_assignment_prompt, force_single_topic=force_single_topic_prompt, sentiment_choices=sentiment_prompt)
1389
 
1390
 
1391
+ if "gemma" in model_choice:
1392
  formatted_summary_prompt = llama_cpp_prefix + formatted_system_prompt + "\n" + formatted_summary_prompt + llama_cpp_suffix
1393
  full_prompt = formatted_summary_prompt
1394
  else:
 
1406
  except Exception as e:
1407
  print(f"Error writing prompt to file {formatted_prompt_output_path}: {e}")
1408
 
1409
+ if "gemma" in model_choice:
1410
  summary_prompt_list = [full_prompt] # Includes system prompt
1411
  else:
1412
  summary_prompt_list = [formatted_summary_prompt]
 
1417
  whole_conversation = []
1418
 
1419
  # Process requests to large language model
 
 
1420
  responses, conversation_history, whole_conversation, whole_conversation_metadata, response_text = call_llm_with_markdown_table_checks(summary_prompt_list, system_prompt, conversation_history, whole_conversation, whole_conversation_metadata, model, config, model_choice, temperature, reported_batch_no, local_model, MAX_OUTPUT_VALIDATION_ATTEMPTS, master = True)
1421
 
1422
+ # Return output tables
 
 
1423
  topic_table_out_path, reference_table_out_path, unique_topics_df_out_path, new_topic_df, new_markdown_table, new_reference_df, new_unique_topics_df, master_batch_out_file_part, is_error = write_llm_output_and_logs(responses, whole_conversation, whole_conversation_metadata, file_name, latest_batch_completed, start_row, end_row, model_choice_clean, temperature, log_files_output_paths, existing_reference_df, existing_unique_topics_df, batch_size, chosen_cols, first_run=False)
1424
 
1425
  # Write final output to text file for logging purposes
 
1444
  if is_error == True:
1445
  final_message_out = "Could not complete summary, error in LLM output."
1446
  raise Exception(final_message_out)
 
1447
 
1448
  # Write outputs to csv
1449
  ## Topics with references
 
1462
 
1463
  # Outputs for markdown table output
1464
  unique_table_df_display_table = new_unique_topics_df.apply(lambda col: col.map(lambda x: wrap_text(x, max_text_length=500)))
1465
+ unique_table_df_display_table_markdown = unique_table_df_display_table[["General Topic", "Subtopic", "Sentiment", "Response References", "Summary"]].to_markdown(index=False)
1466
 
1467
  #whole_conversation_metadata.append(whole_conversation_metadata_str)
1468
  whole_conversation_metadata_str = ' '.join(whole_conversation_metadata)
 
1481
  #system_prompt = system_prompt + normalised_simple_markdown_table
1482
 
1483
  # Prepare Gemini models before query
1484
+ if "gemini" in model_choice:
1485
  print("Using Gemini model:", model_choice)
1486
  model, config = construct_gemini_generative_model(in_api_key=in_api_key, temperature=temperature, model_choice=model_choice, system_prompt=system_prompt, max_tokens=max_tokens)
1487
+ elif "gemma" in model_choice:
1488
+ print("Using local Gemma model:", model_choice)
1489
  else:
1490
  print("Using AWS Bedrock model:", model_choice)
1491
 
 
1499
  if prompt3: formatted_prompt3 = prompt3.format(response_table=normalised_simple_markdown_table, sentiment_choices=sentiment_prompt)
1500
  else: formatted_prompt3 = prompt3
1501
 
1502
+ if "gemma" in model_choice:
1503
  formatted_initial_table_prompt = llama_cpp_prefix + formatted_initial_table_system_prompt + "\n" + formatted_initial_table_prompt + llama_cpp_suffix
1504
  formatted_prompt2 = llama_cpp_prefix + formatted_initial_table_system_prompt + "\n" + formatted_prompt2 + llama_cpp_suffix
1505
  formatted_prompt3 = llama_cpp_prefix + formatted_initial_table_system_prompt + "\n" + formatted_prompt3 + llama_cpp_suffix
 
1605
  # Set to a very high number so as not to mess with subsequent file processing by the user
1606
  #latest_batch_completed = 999
1607
 
1608
+ join_file_paths = []
1609
+
1610
  toc = time.perf_counter()
1611
  final_time = (toc - tic) + time_taken
1612
  out_time = f"Everything finished in {round(final_time,1)} seconds."
 
1637
  ## Reference table mapping response numbers to topics
1638
  existing_reference_df.to_csv(reference_table_out_path, index=None)
1639
  out_file_paths.append(reference_table_out_path)
1640
+ join_file_paths.append(reference_table_out_path)
1641
 
1642
  # Create final unique topics table from reference table to ensure consistent numbers
1643
  final_out_unique_topics_df = create_unique_table_df_from_reference_table(existing_reference_df)
 
1692
 
1693
  print("latest_batch_completed at end of batch iterations to return is", latest_batch_completed)
1694
 
1695
+ return unique_table_df_display_table_markdown, existing_topics_table, final_out_unique_topics_df, existing_reference_df, final_out_file_paths, final_out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, final_out_file_paths, final_out_file_paths, gr.Dataframe(value=modifiable_unique_topics_df, headers=None, col_count=(modifiable_unique_topics_df.shape[1], "fixed"), row_count = (modifiable_unique_topics_df.shape[0], "fixed"), visible=True, type="pandas"), final_out_file_paths, join_file_paths
1696
 
1697
 
1698
+ return unique_table_df_display_table_markdown, existing_topics_table, existing_unique_topics_df, existing_reference_df, out_file_paths, out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, out_file_paths, out_file_paths, gr.Dataframe(value=modifiable_unique_topics_df, headers=None, col_count=(modifiable_unique_topics_df.shape[1], "fixed"), row_count = (modifiable_unique_topics_df.shape[0], "fixed"), visible=True, type="pandas"), out_file_paths, join_file_paths
1699
 
1700
  def convert_reference_table_to_pivot_table(df:pd.DataFrame, basic_response_data:pd.DataFrame=pd.DataFrame()):
1701
 
 
2204
  whole_conversation_metadata = []
2205
 
2206
  # Prepare Gemini models before query
2207
+ if "gemini" in model_choice:
2208
  print("Using Gemini model:", model_choice)
2209
  model, config = construct_gemini_generative_model(in_api_key=in_api_key, temperature=temperature, model_choice=model_choice, system_prompt=system_prompt, max_tokens=max_tokens)
2210
  else:
 
2366
  if (model_choice == "gemma_2b_it_local") & (RUN_LOCAL_MODEL == "1"):
2367
  progress(0.1, "Loading in Gemma 2b model")
2368
  local_model, tokenizer = load_model()
2369
+ #print("Local model loaded:", local_model)
2370
 
2371
  summary_loop_description = "Creating summaries. " + str(latest_summary_completed) + " summaries completed so far."
2372
  summary_loop = tqdm(range(latest_summary_completed, length_all_summaries), desc="Creating summaries", unit="summaries")
tools/prompts.py CHANGED
@@ -29,14 +29,16 @@ In the first column, write 'Not assessed'. In the second column, assign Subtopic
29
  allow_new_topics_prompt = """Create a new markdown table with the headings 'General Topic', 'Subtopic', 'Sentiment', 'Response References', and 'Summary'.
30
  In the first and second columns, assign General Topics and Subtopics to Responses. Assign topics from the Topics table above only if they are very relevant to the text of the Response. Fill in the General Topic and Sentiment for the Subtopic if they do not already exist. If you find a new topic that does not exist in the Topics table, add a new row to the new table. Make the General Topic and Subtopic as specific as possible. The subtopic should never be blank or empty."""
31
 
 
 
32
  add_existing_topics_prompt = """Responses are shown in the following Response table:
33
  {response_table}
34
 
35
  Topics known to be relevant to this dataset are shown in the following Topics table:
36
  {topics}
37
 
38
- Your task is to create one new markdown table, assigning responses from the Response table to existing topics, or to create new topics if no existing topics are relevant.
39
- {topic_assignment}
40
  {sentiment_choices}.
41
  In the fourth column, list each specific Response reference number that is relevant to the Subtopic, separated by commas. Do not write any other text in this column.
42
  In the fifth column, write a short summary of the Subtopic based on relevant responses - highlight specific issues that appear.
@@ -46,7 +48,6 @@ New table:"""
46
 
47
  # Return only one table in markdown format containing all relevant topics. Remove topics from the table that are not assigned to any response. Do not repeat Subtopics with the same Sentiment.
48
 
49
-
50
  summarise_topic_descriptions_system_prompt = system_prompt
51
 
52
  summarise_topic_descriptions_prompt = """Below is a table with number of paragraphs related to the data from the open text column:
@@ -57,6 +58,10 @@ Your task is to make a consolidated summary of the above text. {summary_format}.
57
 
58
  Summary:"""
59

60
 
61
  ## The following didn't work well in testing and so is not currently used
62
 
@@ -74,16 +79,16 @@ New Topics table:"""
74
  verify_titles_system_prompt = system_prompt
75
 
76
 
77
- verify_titles_prompt = """Response numbers alongside the Response text and assigned titles are shown in the table below:
78
  {response_table}
79
 
80
- The criteria for a suitable Title for these responses is that they should be readable, concise, and fully encapsulate the main subject of the response.
81
 
82
  Create a markdown table with four columns.
83
  The first column is 'Response References', and should contain just the response number under consideration.
84
- The second column is 'Is this a suitable title', answer the question with 'Yes' or 'No', with no other text.
85
  The third column is 'Explanation', give a short explanation for your response in the second column.
86
- The fourth column is 'Alternative title', suggest an alternative title for the response that meet the criteria stated above.
87
  Do not add any other text to your response.
88
 
89
  Output markdown table:"""
 
29
  allow_new_topics_prompt = """Create a new markdown table with the headings 'General Topic', 'Subtopic', 'Sentiment', 'Response References', and 'Summary'.
30
  In the first and second columns, assign General Topics and Subtopics to Responses. Assign topics from the Topics table above only if they are very relevant to the text of the Response. Fill in the General Topic and Sentiment for the Subtopic if they do not already exist. If you find a new topic that does not exist in the Topics table, add a new row to the new table. Make the General Topic and Subtopic as specific as possible. The subtopic should never be blank or empty."""
31
 
32
+ force_single_topic_prompt = """ Wherever possible, assign a response to one single topic, unless there are multiple topics that are equally relevant."""
33
+
34
  add_existing_topics_prompt = """Responses are shown in the following Response table:
35
  {response_table}
36
 
37
  Topics known to be relevant to this dataset are shown in the following Topics table:
38
  {topics}
39
 
40
+ Your task is to create one new markdown table, assigning responses from the Response table to topics.
41
+ {topic_assignment}{force_single_topic}
42
  {sentiment_choices}.
43
  In the fourth column, list each specific Response reference number that is relevant to the Subtopic, separated by commas. Do not write any other text in this column.
44
  In the fifth column, write a short summary of the Subtopic based on relevant responses - highlight specific issues that appear.
 
48
 
49
  # Return only one table in markdown format containing all relevant topics. Remove topics from the table that are not assigned to any response. Do not repeat Subtopics with the same Sentiment.
50
 
 
51
  summarise_topic_descriptions_system_prompt = system_prompt
52
 
53
  summarise_topic_descriptions_prompt = """Below is a table with number of paragraphs related to the data from the open text column:
 
58
 
59
  Summary:"""
60
 
61
+ single_para_summary_format_prompt = "Return a concise summary up to one paragraph long that summarises only the most important themes from the original text"
62
+
63
+ two_para_summary_format_prompt = "Return a summary up to two paragraphs long that includes as much detail as possible from the original text"
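Note: these two constants are intended for the {summary_format} placeholder in summarise_topic_descriptions_prompt above, e.g.:

prompt = "Your task is to make a consolidated summary of the above text. {summary_format}."
single_para = ("Return a concise summary up to one paragraph long that summarises "
               "only the most important themes from the original text")
print(prompt.format(summary_format=single_para))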
64
+
65
 
66
  ## The following didn't work well in testing and so is not currently used
67
 
 
79
  verify_titles_system_prompt = system_prompt
80
 
81
 
82
+ verify_titles_prompt = """Response numbers alongside the Response text and assigned descriptions are shown in the table below:
83
  {response_table}
84
 
85
+ The criterion for a suitable description for these responses is that it should be readable, concise, and fully encapsulate the main subject of the response.
86
 
87
  Create a markdown table with four columns.
88
  The first column is 'Response References', and should contain just the response number under consideration.
89
+ The second column is 'Is this a suitable description'; answer the question with 'Yes' or 'No', with no other text.
90
  The third column is 'Explanation', give a short explanation for your response in the second column.
91
+ The fourth column is 'Alternative description'; suggest an alternative description for the response that meets the criteria stated above.
92
  Do not add any other text to your response.
93
 
94
  Output markdown table:"""