seanpedrickcase committed
Commit 99b54b3 · 1 Parent(s): 7122331

Package updates. Can now ask the model to only assign specified topics.
.dockerignore CHANGED
@@ -5,6 +5,7 @@
 *.ipynb
 *.xls
 *.xlsx
+*.csv
 examples/*
 output/*
 tools/__pycache__/*
.gitignore CHANGED
@@ -5,6 +5,7 @@
 *.ipynb
 *.xls
 *.xlsx
+*.csv
 examples/*
 output/*
 tools/__pycache__/*
app.py CHANGED
@@ -1,11 +1,11 @@
 import os
 import socket
 import spaces
-from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df, get_connection_params, output_folder, get_or_create_env_var, reveal_feedback_buttons, wipe_logs, model_full_names, view_table, empty_output_vars_extract_topics, empty_output_vars_summarise, RUN_LOCAL_MODEL
+from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df, get_connection_params, output_folder, get_or_create_env_var, reveal_feedback_buttons, wipe_logs, model_full_names, view_table, empty_output_vars_extract_topics, empty_output_vars_summarise, RUN_LOCAL_MODEL, load_in_previous_reference_file, join_cols_onto_reference_df, GEMINI_API_KEY
 from tools.aws_functions import upload_file_to_s3, RUN_AWS_FUNCTIONS
 from tools.llm_api_call import extract_topics, load_in_data_file, load_in_previous_data_files, sample_reference_table_summaries, summarise_output_topics, batch_size_default, deduplicate_topics, modify_existing_output_tables
 from tools.auth import authenticate_user
-from tools.prompts import initial_table_prompt, prompt2, prompt3, system_prompt, add_existing_topics_system_prompt, add_existing_topics_prompt, verify_titles_prompt, verify_titles_system_prompt
+from tools.prompts import initial_table_prompt, prompt2, prompt3, system_prompt, add_existing_topics_system_prompt, add_existing_topics_prompt, verify_titles_prompt, verify_titles_system_prompt, two_para_summary_format_prompt, single_para_summary_format_prompt
 from tools.verify_titles import verify_titles
 #from tools.aws_functions import load_data_from_aws
 import gradio as gr
@@ -22,18 +22,14 @@ host_name = socket.gethostname()
 access_logs_data_folder = 'logs/' + today_rev + '/' + host_name + '/'
 feedback_data_folder = 'feedback/' + today_rev + '/' + host_name + '/'
 usage_data_folder = 'usage/' + today_rev + '/' + host_name + '/'
-file_input_height = 150
-
-print("RUN_LOCAL_MODEL is:", RUN_LOCAL_MODEL)
+file_input_height = 200

 if RUN_LOCAL_MODEL == "1":
     default_model_choice = "gemma_2b_it_local"
-
 elif RUN_AWS_FUNCTIONS == "1":
     default_model_choice = "anthropic.claude-3-haiku-20240307-v1:0"
-
 else:
-    default_model_choice = "gemini-2.0-flash"
+    default_model_choice = "gemini-2.0-flash-001"

 # Create the gradio interface
 app = gr.Blocks(theme = gr.themes.Base())
@@ -95,7 +91,7 @@ with app:

 Instructions on use can be found in the README.md file. Try it out with this [dummy development consultation dataset](https://huggingface.co/datasets/seanpedrickcase/dummy_development_consultation), which you can also try with [zero-shot topics](https://huggingface.co/datasets/seanpedrickcase/dummy_development_consultation/blob/main/example_zero_shot.csv), or this [dummy case notes dataset](https://huggingface.co/datasets/seanpedrickcase/dummy_case_notes).

-You can use an AWS Bedrock model (Claude 3, paid), or Gemini (a free API, but with strict limits for the Pro model). Due to the strict API limits for the best model (Pro 1.5), the use of Gemini requires an API key. To set up your own Gemini API key, go [here](https://aistudio.google.com/app/u/1/plan_information).
+You can use an AWS Bedrock model (Claude 3, paid), or Gemini (a free API, but with strict limits for the Pro model). The use of Gemini models requires an API key. To set up your own Gemini API key, go [here](https://aistudio.google.com/app/u/1/plan_information).

 NOTE: that **API calls to Gemini are not considered secure**, so please only submit redacted, non-sensitive tabular files to this source. Also, large language models are not 100% accurate and may produce biased or harmful outputs. All outputs from this app **absolutely need to be checked by a human** to check for harmful outputs, hallucinations, and accuracy.""")

@@ -107,7 +103,7 @@ with app:
 )
 with gr.Row():
     model_choice = gr.Dropdown(value = default_model_choice, choices = model_full_names, label="LLM model to use", multiselect=False)
-    in_api_key = gr.Textbox(value = "", label="Enter Gemini API key (only if using Google API models)", lines=1, type="password")
+    in_api_key = gr.Textbox(value = GEMINI_API_KEY, label="Enter Gemini API key (only if using Google API models)", lines=1, type="password")

 with gr.Accordion("Upload xlsx or csv file", open = True):
     in_data_files = gr.File(height=file_input_height, label="Choose Excel or csv files", file_count= "multiple", file_types=['.xlsx', '.xls', '.csv', '.parquet', '.csv.gz'])
@@ -116,12 +112,14 @@ with app:
 in_colnames = gr.Dropdown(choices=["Choose column with responses"], multiselect = False, label="Select the open text column of interest. In an Excel file, this shows columns across all sheets.", allow_custom_value=True, interactive=True)

 with gr.Accordion("I have my own list of topics (zero shot topic modelling).", open = False):
-    candidate_topics = gr.File(height=file_input_height, label="Input topics from file (csv). File should have at least one column with a header, and all topic names below this. Using the headers 'General Topic' and/or 'Subtopic' will allow for these columns to be suggested to the model.")
-    force_zero_shot_radio = gr.Radio(label="Force responses into zero shot topics", value="No", choices=["Yes", "No"])
+    candidate_topics = gr.File(height=file_input_height, label="Input topics from file (csv). File should have at least one column with a header, and all topic names below this. Using the headers 'General Topic' and/or 'Subtopic' will allow for these columns to be suggested to the model. If a third column is present, it will be assumed to be a topic description.")
+    with gr.Row(equal_height=True):
+        force_zero_shot_radio = gr.Radio(label="Force responses into zero shot topics", value="No", choices=["Yes", "No"])
+        force_single_topic_radio = gr.Radio(label="Ask the model to assign responses to only a single topic", value="No", choices=["Yes", "No"])

 context_textbox = gr.Textbox(label="Write up to one sentence giving context to the large language model for your task (e.g. 'Consultation for the construction of flats on Main Street')")

-sentiment_checkbox = gr.Radio(label="Choose sentiment categories to split responses", value="Negative, Neutral, or Positive", choices=["Negative, Neutral, or Positive", "Negative or Positive", "Do not assess sentiment"])
+sentiment_checkbox = gr.Radio(label="Choose sentiment categories to split responses", value="Negative or Positive", choices=["Negative or Positive", "Negative, Neutral, or Positive", "Do not assess sentiment"])

 extract_topics_btn = gr.Button("Extract topics", variant="primary")
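Both radios above reach extract_topics as plain "Yes"/"No" strings. As a rough sketch of how flags like these can switch prompt wording — force_existing_topics_prompt and allow_new_topics_prompt are real names imported in tools/llm_api_call.py, but the prompt text and gating below are illustrative, not the app's exact logic:

# Illustrative only: "Yes"/"No" radio values selecting prompt fragments.
force_existing_topics_prompt = "Assign responses only to topics from the provided table."  # placeholder text
allow_new_topics_prompt = "Assign responses to the provided topics, creating new topics where needed."  # placeholder text

def build_topic_instruction(force_zero_shot: str, force_single_topic: str) -> str:
    instruction = force_existing_topics_prompt if force_zero_shot == "Yes" else allow_new_topics_prompt
    if force_single_topic == "Yes":
        instruction += " Assign each response to a single topic only."
    return instruction

print(build_topic_instruction("Yes", "Yes"))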
 
@@ -153,10 +151,7 @@ with app:

 save_modified_files_button = gr.Button(value="Save modified topic names")

-
-with gr.Accordion("Upload reference data file and unique data files", open = True):
-
-
+with gr.Accordion("Upload reference data file and unique data files", open = True):
     ### DEDUPLICATION
     deduplication_input_files = gr.File(height=file_input_height, label="Upload files to deduplicate topics", file_count= "multiple", file_types=['.xlsx', '.xls', '.csv', '.parquet', '.csv.gz'])
     deduplication_input_files_status = gr.Textbox(value = "", label="Previous file input", visible=False)
@@ -168,11 +163,10 @@ with app:

     deduplicate_previous_data_btn = gr.Button("Deduplicate topics", variant="primary")

-
-    ### SUMMARISATION
+    ### SUMMARISATION
     summarisation_input_files = gr.File(height=file_input_height, label="Upload files to summarise", file_count= "multiple", file_types=['.xlsx', '.xls', '.csv', '.parquet', '.csv.gz'])

-    summarise_format_radio = gr.Radio(label="Choose summary type", value="Return a summary up to two paragraphs long that includes as much detail as possible from the original text", choices=["Return a summary up to two paragraphs long that includes as much detail as possible from the original text", "Return a concise summary up to one paragraph long that summarises only the most important themes from the original text"])
+    summarise_format_radio = gr.Radio(label="Choose summary type", value=two_para_summary_format_prompt, choices=[two_para_summary_format_prompt, single_para_summary_format_prompt])

     summarise_previous_data_btn = gr.Button("Summarise topics", variant="primary")
     summary_output_files = gr.File(height=file_input_height, label="Summarised output files", interactive=False)
@@ -198,10 +192,10 @@ with app:
     in_view_table = gr.File(height=file_input_height, label="Choose unique topic csv files", file_count= "single", file_types=['.csv', '.parquet', '.csv.gz'])
     view_table_markdown = gr.Markdown(value = "", label="View table", show_copy_button=True)

-with gr.Tab(label="Verify titles"):
+with gr.Tab(label="Verify descriptions"):
     gr.Markdown(
     """
-    ### Choose a tabular data file (xlsx or csv) with titles and original text to verify titles/descriptions for.
+    ### Choose a tabular data file (xlsx or csv) with titles and original text to verify descriptions for.
     """
     )
     with gr.Row():
@@ -212,11 +206,11 @@ with app:
     verify_in_data_files = gr.File(height=file_input_height, label="Choose Excel or csv files", file_count= "multiple", file_types=['.xlsx', '.xls', '.csv', '.parquet', '.csv.gz'])

     verify_in_excel_sheets = gr.Dropdown(choices=["Choose Excel sheet"], multiselect = False, label="Select the Excel sheet.", visible=False, allow_custom_value=True)
-    verify_in_colnames = gr.Dropdown(choices=["Choose column with responses"], multiselect = True, label="Select the open text columns that have a response and a title. In an Excel file, this shows columns across all sheets.", allow_custom_value=True, interactive=True)
+    verify_in_colnames = gr.Dropdown(choices=["Choose column with responses"], multiselect = True, label="Select the open text columns that have a response and a title/description. In an Excel file, this shows columns across all sheets.", allow_custom_value=True, interactive=True)
     #verify_title_colnames = gr.Dropdown(choices=["Choose column with titles"], multiselect = False, label="Select the open text columns that have a title. In an Excel file, this shows columns across all sheets.", allow_custom_value=True, interactive=True)

-    verify_titles_btn = gr.Button("Verify titles", variant="primary")
-    verify_titles_file_output = gr.File(height=file_input_height, label="Title verification output files")
+    verify_titles_btn = gr.Button("Verify descriptions", variant="primary")
+    verify_titles_file_output = gr.File(height=file_input_height, label="Descriptions verification output files")
     verify_display_topic_table_markdown = gr.Markdown(value="### Language model response will appear here", show_copy_button=True)

     verify_modification_input_files_placeholder = gr.File(height=file_input_height, label="Placeholder for files to avoid errors", visible=False)
@@ -231,7 +225,7 @@ with app:
     batch_size_number = gr.Number(label = "Number of responses to submit in a single LLM query", value = batch_size_default, precision=0, minimum=1, maximum=100)
     random_seed = gr.Number(value=42, label="Random seed for LLM generation", visible=False)

-    with gr.Accordion("Prompt settings", open = True):
+    with gr.Accordion("Prompt settings", open = False):
        number_of_prompts = gr.Number(value=1, label="Number of prompts to send to LLM in sequence", minimum=1, maximum=3, visible=False)
        system_prompt_textbox = gr.Textbox(label="Initial system prompt", lines = 4, value = system_prompt)
        initial_table_prompt_textbox = gr.Textbox(label = "Initial topics prompt", lines = 8, value = initial_table_prompt)
@@ -241,9 +235,17 @@ with app:
        add_to_existing_topics_prompt_textbox = gr.Textbox(label = "Additional topics prompt", lines = 8, value = add_existing_topics_prompt)
        verify_titles_system_prompt_textbox = gr.Textbox(label="Additional topics system prompt", lines = 4, value = verify_titles_system_prompt)
        verify_titles_prompt_textbox = gr.Textbox(label = "Additional topics prompt", lines = 8, value = verify_titles_prompt)
-
-    log_files_output = gr.File(height=file_input_height, label="Log file output", interactive=False)
-    conversation_metadata_textbox = gr.Textbox(label="Query metadata - usage counts and other parameters", interactive=False, lines=8)
+
+    with gr.Accordion("Join additional columns to reference file outputs", open = False):
+        join_colnames = gr.Dropdown(choices=["Choose column with responses"], multiselect = True, label="Select the open text column of interest. In an Excel file, this shows columns across all sheets.", allow_custom_value=True, interactive=True)
+        with gr.Row():
+            in_join_files = gr.File(height=file_input_height, label="Reference file should go here. Original data file should be loaded on the first tab.")
+            join_cols_btn = gr.Button("Join columns to reference output", variant="primary")
+            out_join_files = gr.File(height=file_input_height, label="Output joined reference files will go here.")
+
+    with gr.Accordion("Logging outputs", open = False):
+        log_files_output = gr.File(height=file_input_height, label="Log file output", interactive=False)
+        conversation_metadata_textbox = gr.Textbox(label="Query metadata - usage counts and other parameters", interactive=False, lines=8)

     # Invisible text box to hold the session hash/username just for logging purposes
     session_hash_textbox = gr.Textbox(label = "Session hash", value="", visible=False)
@@ -271,25 +273,28 @@ with app:
 ###

 # Tabular data upload
-in_data_files.upload(fn=put_columns_in_df, inputs=[in_data_files], outputs=[in_colnames, in_excel_sheets, reference_data_file_name_textbox])
+in_data_files.upload(fn=put_columns_in_df, inputs=[in_data_files], outputs=[in_colnames, in_excel_sheets, reference_data_file_name_textbox, join_colnames])

 extract_topics_btn.click(fn=empty_output_vars_extract_topics, inputs=None, outputs=[master_topic_df_state, master_unique_topics_df_state, master_reference_df_state, topic_extraction_output_files, text_output_file_list_state, latest_batch_completed, log_files_output, log_files_output_list_state, conversation_metadata_textbox, estimated_time_taken_number, file_data_state, reference_data_file_name_textbox, display_topic_table_markdown]).\
 success(load_in_data_file,
 inputs = [in_data_files, in_colnames, batch_size_number, in_excel_sheets], outputs = [file_data_state, reference_data_file_name_textbox, total_number_of_batches], api_name="load_data").\
 success(fn=extract_topics,
-inputs=[in_data_files, file_data_state, master_topic_df_state, master_reference_df_state, master_unique_topics_df_state, display_topic_table_markdown, reference_data_file_name_textbox, total_number_of_batches, in_api_key, temperature_slide, in_colnames, model_choice, candidate_topics, latest_batch_completed, display_topic_table_markdown, text_output_file_list_state, log_files_output_list_state, first_loop_state, conversation_metadata_textbox, initial_table_prompt_textbox, prompt_2_textbox, prompt_3_textbox, system_prompt_textbox, add_to_existing_topics_system_prompt_textbox, add_to_existing_topics_prompt_textbox, number_of_prompts, batch_size_number, context_textbox, estimated_time_taken_number, sentiment_checkbox, force_zero_shot_radio, in_excel_sheets],
-outputs=[display_topic_table_markdown, master_topic_df_state, master_unique_topics_df_state, master_reference_df_state, topic_extraction_output_files, text_output_file_list_state, latest_batch_completed, log_files_output, log_files_output_list_state, conversation_metadata_textbox, estimated_time_taken_number, deduplication_input_files, summarisation_input_files, modifiable_unique_topics_df_state, modification_input_files], api_name="extract_topics")
+inputs=[in_data_files, file_data_state, master_topic_df_state, master_reference_df_state, master_unique_topics_df_state, display_topic_table_markdown, reference_data_file_name_textbox, total_number_of_batches, in_api_key, temperature_slide, in_colnames, model_choice, candidate_topics, latest_batch_completed, display_topic_table_markdown, text_output_file_list_state, log_files_output_list_state, first_loop_state, conversation_metadata_textbox, initial_table_prompt_textbox, prompt_2_textbox, prompt_3_textbox, system_prompt_textbox, add_to_existing_topics_system_prompt_textbox, add_to_existing_topics_prompt_textbox, number_of_prompts, batch_size_number, context_textbox, estimated_time_taken_number, sentiment_checkbox, force_zero_shot_radio, in_excel_sheets, force_single_topic_radio],
+outputs=[display_topic_table_markdown, master_topic_df_state, master_unique_topics_df_state, master_reference_df_state, topic_extraction_output_files, text_output_file_list_state, latest_batch_completed, log_files_output, log_files_output_list_state, conversation_metadata_textbox, estimated_time_taken_number, deduplication_input_files, summarisation_input_files, modifiable_unique_topics_df_state, modification_input_files, in_join_files], api_name="extract_topics")


 # If the output file count text box changes, keep going with redacting each data file until done. Then reveal the feedback buttons.
 # latest_batch_completed.change(fn=extract_topics,
 # inputs=[in_data_files, file_data_state, master_topic_df_state, master_reference_df_state, master_unique_topics_df_state, display_topic_table_markdown, reference_data_file_name_textbox, total_number_of_batches, in_api_key, temperature_slide, in_colnames, model_choice, candidate_topics, latest_batch_completed, display_topic_table_markdown, text_output_file_list_state, log_files_output_list_state, second_loop_state, conversation_metadata_textbox, initial_table_prompt_textbox, prompt_2_textbox, prompt_3_textbox, system_prompt_textbox, add_to_existing_topics_system_prompt_textbox, add_to_existing_topics_prompt_textbox, number_of_prompts, batch_size_number, context_textbox, estimated_time_taken_number, sentiment_checkbox, force_zero_shot_radio, in_excel_sheets],
-# outputs=[display_topic_table_markdown, master_topic_df_state, master_unique_topics_df_state, master_reference_df_state, topic_extraction_output_files, text_output_file_list_state, latest_batch_completed, log_files_output, log_files_output_list_state, conversation_metadata_textbox, estimated_time_taken_number, deduplication_input_files, summarisation_input_files, modifiable_unique_topics_df_state, modification_input_files]).\
+# outputs=[display_topic_table_markdown, master_topic_df_state, master_unique_topics_df_state, master_reference_df_state, topic_extraction_output_files, text_output_file_list_state, latest_batch_completed, log_files_output, log_files_output_list_state, conversation_metadata_textbox, estimated_time_taken_number, deduplication_input_files, summarisation_input_files, modifiable_unique_topics_df_state, modification_input_files, in_join_files]).\
 # success(fn = reveal_feedback_buttons,
 # outputs=[data_feedback_radio, data_further_details_text, data_submit_feedback_btn, data_feedback_title], scroll_to_output=True)

 # If you upload data into the deduplication input box, the modifiable topic dataframe box is updated
 modification_input_files.change(fn=load_in_previous_data_files, inputs=[modification_input_files, modified_unique_table_change_bool], outputs=[modifiable_unique_topics_df_state, master_modify_reference_df_state, master_modify_unique_topics_df_state, reference_data_file_name_textbox, unique_topics_table_file_name_textbox, text_output_modify_file_list_state])
+
+
+


 # Modify output table with custom topic names
@@ -314,17 +319,29 @@ with app:
 load_in_data_file, inputs = [in_data_files, in_colnames, batch_size_number, in_excel_sheets], outputs = [file_data_state, reference_data_file_name_textbox, total_number_of_batches]).\
 success(load_in_previous_data_files, inputs=[in_previous_data_files], outputs=[master_reference_df_state, master_unique_topics_df_state, latest_batch_completed, in_previous_data_files_status, reference_data_file_name_textbox])

-# VERIFY TITLES OR DESCRIPTIONS OF TEXT
+# VERIFY DESCRIPTIONS OF TEXT

 # Tabular data upload
-verify_in_data_files.upload(fn=put_columns_in_df, inputs=[verify_in_data_files], outputs=[verify_in_colnames, verify_in_excel_sheets, reference_data_file_name_textbox])
+verify_in_data_files.upload(fn=put_columns_in_df, inputs=[verify_in_data_files], outputs=[verify_in_colnames, verify_in_excel_sheets, reference_data_file_name_textbox, join_colnames])

 verify_titles_btn.click(fn=empty_output_vars_extract_topics, inputs=None, outputs=[master_topic_df_state, master_unique_topics_df_state, master_reference_df_state, topic_extraction_output_files, text_output_file_list_state, latest_batch_completed, log_files_output, log_files_output_list_state, conversation_metadata_textbox, estimated_time_taken_number, file_data_state, reference_data_file_name_textbox, display_topic_table_markdown]).\
 success(load_in_data_file,
 inputs = [verify_in_data_files, verify_in_colnames, batch_size_number, verify_in_excel_sheets], outputs = [file_data_state, reference_data_file_name_textbox, total_number_of_batches], api_name="verify_load_data").\
 success(fn=verify_titles,
 inputs=[verify_in_data_files, file_data_state, master_topic_df_state, master_reference_df_state, master_unique_topics_df_state, display_topic_table_markdown, reference_data_file_name_textbox, total_number_of_batches, verify_in_api_key, temperature_slide, verify_in_colnames, verify_model_choice, candidate_topics, latest_batch_completed, display_topic_table_markdown, text_output_file_list_state, log_files_output_list_state, first_loop_state, conversation_metadata_textbox, verify_titles_prompt_textbox, prompt_2_textbox, prompt_3_textbox, verify_titles_system_prompt_textbox, verify_titles_system_prompt_textbox, verify_titles_prompt_textbox, number_of_prompts, batch_size_number, context_textbox, estimated_time_taken_number, sentiment_checkbox, force_zero_shot_radio, in_excel_sheets],
-outputs=[verify_display_topic_table_markdown, master_topic_df_state, master_unique_topics_df_state, master_reference_df_state, verify_titles_file_output, text_output_file_list_state, latest_batch_completed, log_files_output, log_files_output_list_state, conversation_metadata_textbox, estimated_time_taken_number, deduplication_input_files, summarisation_input_files, modifiable_unique_topics_df_state, verify_modification_input_files_placeholder], api_name="verify_titles")
+outputs=[verify_display_topic_table_markdown, master_topic_df_state, master_unique_topics_df_state, master_reference_df_state, verify_titles_file_output, text_output_file_list_state, latest_batch_completed, log_files_output, log_files_output_list_state, conversation_metadata_textbox, estimated_time_taken_number, deduplication_input_files, summarisation_input_files, modifiable_unique_topics_df_state, verify_modification_input_files_placeholder], api_name="verify_descriptions")
+
+###
+# LLM SETTINGS PAGE
+###
+
+reference_df_data_file_name_textbox = gr.Textbox(label="reference_df_data_file_name_textbox", visible=False)
+master_reference_df_state_joined = gr.State(pd.DataFrame())
+
+join_cols_btn.click(fn=load_in_previous_reference_file, inputs=[in_join_files], outputs=[master_reference_df_state, reference_df_data_file_name_textbox]).\
+success(load_in_data_file,
+inputs = [in_data_files, in_colnames, batch_size_number, in_excel_sheets], outputs = [file_data_state, reference_data_file_name_textbox, total_number_of_batches]).\
+success(fn=join_cols_onto_reference_df, inputs=[master_reference_df_state, file_data_state, join_colnames, reference_df_data_file_name_textbox], outputs=[master_reference_df_state_joined, out_join_files])

 ###
 # LOGGING AND ON APP LOAD FUNCTIONS
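The handlers above rely on Gradio's event chaining: .click() returns an event object whose .success() step runs only if the previous function finished without raising. A minimal self-contained sketch of the pattern (placeholder functions, not the app's real handlers):

import gradio as gr

def load(x):
    return f"loaded {x}"

def process(msg):
    return msg.upper()

with gr.Blocks() as demo:
    inp = gr.Textbox(label="Input")
    mid = gr.Textbox(label="Loaded")
    out = gr.Textbox(label="Processed")
    btn = gr.Button("Run")
    # Each .success() step only fires if the preceding step succeeded,
    # mirroring the load-then-extract chains wired up above.
    btn.click(fn=load, inputs=inp, outputs=mid).\
        success(fn=process, inputs=mid, outputs=out)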
 
requirements.txt CHANGED
@@ -1,7 +1,7 @@
 pandas==2.2.3
-gradio==5.23.3
+gradio==5.32.0
 spaces==0.34.1
-boto3==1.37.29
+boto3==1.38.5
 pyarrow==19.0.1
 openpyxl==3.1.3
 markdown==3.7
requirements_aws.txt CHANGED
@@ -1,7 +1,7 @@
 pandas==2.2.3
-gradio==5.23.3
+gradio==5.32.0
 spaces==0.34.1
-boto3==1.37.29
+boto3==1.38.5
 pyarrow==19.0.1
 openpyxl==3.1.3
 markdown==3.7
requirements_gpu.txt CHANGED
@@ -1,7 +1,7 @@
 pandas==2.2.3
-gradio==5.23.3
+gradio==5.32.0
 spaces==0.34.1
-boto3==1.37.29
+boto3==1.38.5
 pyarrow==19.0.1
 openpyxl==3.1.3
 markdown==3.7
@@ -14,7 +14,8 @@ rapidfuzz==3.10.1
 torch==2.6.0 --extra-index-url https://download.pytorch.org/whl/cu121
 #llama-cpp-python==0.3.4 --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu121
 # Specify exact llama_cpp wheel for huggingface compatibility
-https://github.com/abetlen/llama-cpp-python/releases/download/v0.3.4-cu121/llama_cpp_python-0.3.4-cp311-cp311-linux_x86_64.whl
+#https://github.com/abetlen/llama-cpp-python/releases/download/v0.3.4-cu121/llama_cpp_python-0.3.4-cp311-cp311-linux_x86_64.whl
+https://github.com/abetlen/llama-cpp-python/releases/download/v0.3.4-cu121/llama_cpp_python-0.3.4-cp311-cp311-win_amd64.whl # Windows
 transformers==4.51.1
 numpy==1.26.4
 typing_extensions==4.12.2
tools/aws_functions.py CHANGED
@@ -13,9 +13,14 @@ bucket_name=""
 AWS_REGION = get_or_create_env_var('AWS_REGION', 'eu-west-2')
 print(f'The value of AWS_REGION is {AWS_REGION}')

+CONSULTATION_SUMMARY_BUCKET = get_or_create_env_var('CONSULTATION_SUMMARY_BUCKET', '')
+print(f'The value of CONSULTATION_SUMMARY_BUCKET is {CONSULTATION_SUMMARY_BUCKET}')
+
+
+
 if RUN_AWS_FUNCTIONS == "1":
     try:
-        bucket_name = os.environ['CONSULTATION_SUMMARY_BUCKET']
+        bucket_name = CONSULTATION_SUMMARY_BUCKET
         session = boto3.Session() # profile_name="default"
     except Exception as e:
         print(e)
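The body of get_or_create_env_var isn't shown in this diff; a minimal sketch of what a helper with this signature plausibly does (an assumption, consistent with how it is called here):

import os

def get_or_create_env_var(var_name: str, default_value: str) -> str:
    # Read the variable if set; otherwise fall back to (and record) the default.
    value = os.environ.get(var_name)
    if value is None:
        os.environ[var_name] = default_value
        value = default_value
    return value

# The commit swaps a hard os.environ['CONSULTATION_SUMMARY_BUCKET'] lookup
# for this pattern, so a missing variable no longer raises KeyError:
bucket_name = get_or_create_env_var('CONSULTATION_SUMMARY_BUCKET', '')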
tools/helper_functions.py CHANGED
@@ -1,6 +1,9 @@
 import os
+import re
 import gradio as gr
 import pandas as pd
+from typing import List
+import math

 def empty_output_vars_extract_topics():
     # Empty output objects before processing a new file
@@ -46,22 +49,35 @@ def get_or_create_env_var(var_name, default_value):

     return value

-RUN_AWS_FUNCTIONS = get_or_create_env_var("RUN_AWS_FUNCTIONS", "0")
+RUN_AWS_FUNCTIONS = get_or_create_env_var("RUN_AWS_FUNCTIONS", "1")
 print(f'The value of RUN_AWS_FUNCTIONS is {RUN_AWS_FUNCTIONS}')

 RUN_LOCAL_MODEL = get_or_create_env_var("RUN_LOCAL_MODEL", "1")
 print(f'The value of RUN_LOCAL_MODEL is {RUN_LOCAL_MODEL}')

+RUN_GEMINI_MODELS = get_or_create_env_var("RUN_GEMINI_MODELS", "1")
+print(f'The value of RUN_GEMINI_MODELS is {RUN_GEMINI_MODELS}')
+
+GEMINI_API_KEY = get_or_create_env_var('GEMINI_API_KEY', '')
+
+# Build up options for models
+model_full_names = []
+model_short_names = []
+
+if RUN_LOCAL_MODEL == "1":
+    model_full_names.append("gemma_2b_it_local")
+    model_short_names.append("gemma_local")
+
 if RUN_AWS_FUNCTIONS == "1":
-    model_full_names = ["anthropic.claude-3-haiku-20240307-v1:0", "anthropic.claude-3-sonnet-20240229-v1:0", "gemini-2.0-flash", "gemini-1.5-pro-002", "gemma_2b_it_local"]
-    model_short_names = ["haiku", "sonnet", "gemini_flash", "gemini_pro", "gemma_local"]
-else:
-    model_full_names = ["gemini-2.0-flash", "gemini-1.5-pro-002", "gemma_2b_it_local"]
-    model_short_names = ["gemini_flash", "gemini_pro", "gemma_local"]
+    model_full_names.extend(["anthropic.claude-3-haiku-20240307-v1:0", "anthropic.claude-3-sonnet-20240229-v1:0"])
+    model_short_names.extend(["haiku", "sonnet"])

-if RUN_LOCAL_MODEL == "0":
-    model_full_names.remove("gemma_2b_it_local")
-    model_short_names.remove("gemma_local")
+if RUN_GEMINI_MODELS == "1":
+    model_full_names.extend(["gemini-2.0-flash-001", "gemini-2.5-flash-preview-05-20", "gemini-2.5-pro-exp-05-06"]) # Gemini Pro no longer available on free tier
+    model_short_names.extend(["gemini_flash_2", "gemini_flash_2.5", "gemini_pro"])
+
+print("model_short_names:", model_short_names)
+print("model_full_names:", model_full_names)

 model_name_map = {short: full for short, full in zip(model_full_names, model_short_names)}
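One subtlety in the unchanged model_name_map line above: zip(model_full_names, model_short_names) yields (full, short) pairs, so unpacking them as short, full makes the dictionary keys the full names despite the variable naming. A quick check with two of the names from the lists above:

model_full_names = ["gemma_2b_it_local", "anthropic.claude-3-haiku-20240307-v1:0"]
model_short_names = ["gemma_local", "haiku"]

# zip pairs the lists positionally as (full, short); the comprehension's
# variable names are swapped, so keys end up being the full names.
model_name_map = {short: full for short, full in zip(model_full_names, model_short_names)}
print(model_name_map)
# {'gemma_2b_it_local': 'gemma_local', 'anthropic.claude-3-haiku-20240307-v1:0': 'haiku'}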
 
@@ -123,6 +139,113 @@ def read_file(filename:str, sheet:str=""):
     elif file_type == 'parquet':
         return pd.read_parquet(filename)

+def load_in_file(file_path: str, colnames:List[str]="", excel_sheet:str=""):
+    """
+    Loads in a tabular data file and returns data and file name.
+
+    Parameters:
+    - file_path (str): The path to the file to be processed.
+    - colnames (List[str], optional): list of colnames to load in
+    """
+
+    #file_type = detect_file_type(file_path)
+    #print("File type is:", file_type)
+
+    file_name = get_file_name_no_ext(file_path)
+    file_data = read_file(file_path, excel_sheet)
+
+    if colnames and isinstance(colnames, list):
+        col_list = colnames
+    else:
+        col_list = list(file_data.columns)
+
+    if not isinstance(col_list, List):
+        col_list = [col_list]
+
+    col_list = [item for item in col_list if item not in ["", "NA"]]
+
+    for col in col_list:
+        file_data[col] = file_data[col].fillna("")
+        file_data[col] = file_data[col].astype(str).str.replace("\bnan\b", "", regex=True)
+
+    #print(file_data[colnames])
+
+    return file_data, file_name
+
+def load_in_data_file(file_paths:List[str], in_colnames:List[str], batch_size:int=50, in_excel_sheets:str=""):
+    '''Load in data table, work out how many batches needed.'''
+
+    if not isinstance(in_colnames, list):
+        in_colnames = [in_colnames]
+
+    #print("in_colnames:", in_colnames)
+
+    try:
+        file_data, file_name = load_in_file(file_paths[0], colnames=in_colnames, excel_sheet=in_excel_sheets)
+        num_batches = math.ceil(len(file_data) / batch_size)
+        print("Total number of batches:", num_batches)
+
+    except Exception as e:
+        print(e)
+        file_data = pd.DataFrame()
+        file_name = ""
+        num_batches = 1
+
+    return file_data, file_name, num_batches
+
+def load_in_previous_reference_file(file:str):
+    '''Load in data table from a partially completed consultation summary to continue it.'''
+
+    reference_file_data = pd.DataFrame()
+    reference_file_name = ""
+    out_message = ""
+
+    #for file in file_paths:
+
+    print("file:", file)
+
+    # If reference table
+    if 'reference_table' in file:
+        try:
+            reference_file_data, reference_file_name = load_in_file(file)
+            #print("reference_file_data:", reference_file_data.head(2))
+            out_message = out_message + " Reference file load successful."
+        except Exception as e:
+            out_message = "Could not load reference file data:" + str(e)
+            raise Exception("Could not load reference file data:", e)
+
+    if reference_file_data.empty:
+        out_message = out_message + " No reference data table provided."
+        raise Exception(out_message)
+
+    print(out_message)
+
+    return reference_file_data, reference_file_name
+
+def join_cols_onto_reference_df(reference_df:pd.DataFrame, original_data_df:pd.DataFrame, join_columns:List[str], original_file_name:str, output_folder:str=output_folder):
+
+    #print("original_data_df columns:", original_data_df.columns)
+
+    original_data_df.reset_index(names="Response References", inplace=True)
+    original_data_df["Response References"] += 1
+
+    #print("reference_df columns:", reference_df.columns)
+
+    join_columns.append("Response References")
+
+    reference_df["Response References"] = reference_df["Response References"].fillna("-1").astype(int)
+
+    save_file_name = output_folder + original_file_name + "_j.csv"
+
+    out_reference_df = reference_df.merge(original_data_df[join_columns], on = "Response References", how="left")
+    out_reference_df.to_csv(save_file_name, index=None)
+
+    file_data_outputs = [save_file_name]
+
+    return out_reference_df, file_data_outputs
+
 # Wrap text in each column to the specified max width, including whole words
 def wrap_text(text:str, max_width=60, max_text_length=None):
     if not isinstance(text, str):
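A toy illustration of the keying that join_cols_onto_reference_df relies on: the original responses get 1-based "Response References" from their row index, and the chosen columns are merged onto the reference table by that key (column names and values below are made up):

import pandas as pd

original = pd.DataFrame({"Response": ["Too tall", "Good design"], "Ward": ["North", "South"]})
original = original.reset_index(names="Response References")
original["Response References"] += 1  # 1-based, matching the reference table

reference = pd.DataFrame({"Response References": [1, 2], "Subtopic": ["Height", "Design"]})
joined = reference.merge(original[["Ward", "Response References"]], on="Response References", how="left")
print(joined)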
@@ -209,6 +332,26 @@ def wrap_text(text:str, max_width=60, max_text_length=None):

     return '<br>'.join(wrapped_lines)

+def initial_clean(text):
+    #### Some of my cleaning functions
+    html_pattern_regex = r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});|\xa0|&nbsp;'
+    html_start_pattern_end_dots_regex = r'<(.*?)\.\.'
+    non_ascii_pattern = r'[^\x00-\x7F]+'
+    multiple_spaces_regex = r'\s{2,}'
+
+    # Define a list of patterns and their replacements
+    patterns = [
+        (html_pattern_regex, ' '),
+        (html_start_pattern_end_dots_regex, ' '),
+        (non_ascii_pattern, ' '),
+        (multiple_spaces_regex, ' ')
+    ]
+
+    # Apply each regex replacement
+    for pattern, replacement in patterns:
+        text = re.sub(pattern, replacement, text)
+
+    return text

 def view_table(file_path: str): # Added max_width parameter
     df = pd.read_csv(file_path)
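Given the definition above, a quick check of initial_clean on a messy response string (the import works once this commit's version of tools/helper_functions.py is in place):

from tools.helper_functions import initial_clean

sample = "<p>Great&nbsp;scheme\u2026</p>   but  too   tall"
print(initial_clean(sample))
# HTML tags and entities and non-ASCII characters become spaces, then runs
# of whitespace collapse, leaving roughly " Great scheme but too tall"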
@@ -234,7 +377,7 @@ def ensure_output_folder_exists():
     else:
         print(f"The 'output/' folder already exists.")

-def put_columns_in_df(in_file):
+def put_columns_in_df(in_file:List[str]):
     new_choices = []
     concat_choices = []
     all_sheet_names = []
@@ -272,9 +415,9 @@ def put_columns_in_df(in_file):
     concat_choices = sorted(set(concat_choices))

     if number_of_excel_files > 0:
-        return gr.Dropdown(choices=concat_choices, value=concat_choices[0]), gr.Dropdown(choices=all_sheet_names, value=all_sheet_names[0], visible=True, interactive=True), file_end
+        return gr.Dropdown(choices=concat_choices, value=concat_choices[0]), gr.Dropdown(choices=all_sheet_names, value=all_sheet_names[0], visible=True, interactive=True), file_end, gr.Dropdown(choices=concat_choices)
     else:
-        return gr.Dropdown(choices=concat_choices, value=concat_choices[0]), gr.Dropdown(visible=False), file_end
+        return gr.Dropdown(choices=concat_choices, value=concat_choices[0]), gr.Dropdown(visible=False), file_end, gr.Dropdown(choices=concat_choices)

 # Following function is only relevant for locally-created executable files based on this app (when using pyinstaller it creates a _internal folder that contains tesseract and poppler. These need to be added to the system path to enable the app to run)
 def add_folder_to_path(folder_path: str):
 
tools/llm_api_call.py CHANGED
@@ -19,8 +19,8 @@ from io import StringIO

 GradioFileData = gr.FileData

-from tools.prompts import initial_table_prompt, prompt2, prompt3, system_prompt, summarise_topic_descriptions_prompt, summarise_topic_descriptions_system_prompt, add_existing_topics_system_prompt, add_existing_topics_prompt, create_general_topics_system_prompt, create_general_topics_prompt, force_existing_topics_prompt, allow_new_topics_prompt
-from tools.helper_functions import output_folder, detect_file_type, get_file_name_no_ext, read_file, get_or_create_env_var, model_name_map, put_columns_in_df, wrap_text
 from tools.chatfuncs import LlamaCPPGenerationConfig, call_llama_cpp_model, load_model, RUN_LOCAL_MODEL

 # ResponseObject class for AWS Bedrock calls
@@ -59,62 +59,6 @@ def normalise_string(text):

     return text

-def load_in_file(file_path: str, colnames:List[str]="", excel_sheet:str=""):
-    """
-    Loads in a tabular data file and returns data and file name.
-
-    Parameters:
-    - file_path (str): The path to the file to be processed.
-    - colnames (List[str], optional): list of colnames to load in
-    """
-
-    file_type = detect_file_type(file_path)
-    #print("File type is:", file_type)
-
-    file_name = get_file_name_no_ext(file_path)
-    file_data = read_file(file_path, excel_sheet)
-
-    print("colnames:", colnames)
-
-    if colnames and isinstance(colnames, list):
-        col_list = colnames
-    else:
-        col_list = list(file_data.columns)
-
-    if not isinstance(col_list, List):
-        col_list = [col_list]
-
-    col_list = [item for item in col_list if item not in ["", "NA"]]
-
-    for col in col_list:
-        file_data[col] = file_data[col].fillna("")
-        file_data[col] = file_data[col].astype(str).str.replace("\bnan\b", "", regex=True)
-
-    #print(file_data[colnames])
-
-    return file_data, file_name
-
-def load_in_data_file(file_paths:List[str], in_colnames:List[str], batch_size:int=50, in_excel_sheets:str=""):
-    '''Load in data table, work out how many batches needed.'''
-
-    if not isinstance(in_colnames, list):
-        in_colnames = [in_colnames]
-
-    print("in_colnames:", in_colnames)
-
-    try:
-        file_data, file_name = load_in_file(file_paths[0], colnames=in_colnames, excel_sheet=in_excel_sheets)
-        num_batches = math.ceil(len(file_data) / batch_size)
-        print("Total number of batches:", num_batches)
-
-    except Exception as e:
-        print(e)
-        file_data = pd.DataFrame()
-        file_name = ""
-        num_batches = 1
-
-    return file_data, file_name, num_batches
-
 def load_in_previous_data_files(file_paths_partial_output:List[str], for_modified_table:bool=False):
     '''Load in data table from a partially completed consultation summary to continue it.'''
@@ -186,6 +130,7 @@ def load_in_previous_data_files(file_paths_partial_output:List[str], for_modifie

     return gr.Dataframe(value=unique_file_data, headers=None, col_count=(unique_file_data.shape[1], "fixed"), row_count = (unique_file_data.shape[0], "fixed"), visible=True, type="pandas"), reference_file_data, unique_file_data, reference_file_name, unique_file_name, out_file_names

+
 def get_basic_response_data(file_data:pd.DataFrame, chosen_cols:List[str], verify_titles:bool=False) -> pd.DataFrame:

     if not isinstance(chosen_cols, list):
@@ -199,10 +144,12 @@ def get_basic_response_data(file_data:pd.DataFrame, chosen_cols:List[str], verif
     if verify_titles == True:
         basic_response_data = basic_response_data.rename(columns={chosen_cols[0]: "Response", chosen_cols[1]: "Title"})
         basic_response_data["Title"] = basic_response_data["Title"].str.strip()
+
     else:
         basic_response_data = basic_response_data.rename(columns={chosen_cols[0]: "Response"})

     basic_response_data["Response"] = basic_response_data["Response"].str.strip()
+

     return basic_response_data
@@ -245,12 +192,12 @@ def data_file_to_markdown_table(file_data:pd.DataFrame, file_name:str, chosen_co
     else:
         end_row = file_len + 1

-    print("start_row:", start_row)
-    print("end_row:", end_row)

     batch_basic_response_data = basic_response_data[start_row:end_row] # Select the current batch

-    print("batch_basic_response_data:", batch_basic_response_data)

     # Now replace the reference numbers with numbers starting from 1
     batch_basic_response_data.loc[:, "Reference"] = batch_basic_response_data["Reference"] - start_row
@@ -398,7 +345,7 @@ def call_aws_claude(prompt: str, system_prompt: str, temperature: float, max_tok

     # Now you can access both the text and metadata
     #print("Text:", response.text)
-    print("Metadata:", response.usage_metadata)
     #print("Text:", response.text)

     return response
@@ -428,7 +375,7 @@ def send_request(prompt: str, conversation_history: List[dict], model: object, c
     progress_bar = range(0,number_of_api_retry_attempts)

     # Generate the model's response
-    if model_choice in ["gemini-2.0-flash", "gemini-1.5-pro-002"]:

         for i in progress_bar:
             try:
@@ -451,7 +398,7 @@ def send_request(prompt: str, conversation_history: List[dict], model: object, c

         if i == number_of_api_retry_attempts:
             return ResponseObject(text="", usage_metadata={'RequestId':"FAILED"}), conversation_history
-    elif model_choice in ["anthropic.claude-3-haiku-20240307-v1:0", "anthropic.claude-3-sonnet-20240229-v1:0"]:
         for i in progress_bar:
             try:
                 print("Calling AWS Claude model, attempt", i + 1)
@@ -661,70 +608,6 @@ def clean_markdown_table(text: str):

     return result

-# def clean_markdown_table(text: str):
-#     lines = text.splitlines()
-
-#     # Remove any empty rows or rows with only pipes
-#     cleaned_lines = [line for line in lines if not re.match(r'^\s*\|?\s*\|?\s*$', line)]
-
-#     # Merge lines that belong to the same row (i.e., don't start with |)
-#     merged_lines = []
-#     buffer = ""
-
-#     for line in cleaned_lines:
-#         if line.lstrip().startswith('|'): # If line starts with |, it's a new row
-#             if buffer:
-#                 merged_lines.append(buffer) # Append the buffered content
-#             buffer = line # Start a new buffer with this row
-#         else:
-#             # Continuation of the previous row
-#             buffer += ' ' + line.strip() # Add content to the current buffer
-
-#     # Don't forget to append the last buffer
-#     if buffer:
-#         merged_lines.append(buffer)
-
-#     # Fix the header separator row if necessary
-#     if len(merged_lines) > 1:
-#         header_pipes = merged_lines[0].count('|') # Count pipes in the header row
-#         header_separator = '|---|' * (header_pipes - 1) + '|---|' # Generate proper separator
-
-#         # Replace or insert the separator row
-#         if not re.match(r'^\|[-:|]+$', merged_lines[1]): # Check if the second row is a valid separator
-#             merged_lines.insert(1, header_separator)
-#         else:
-#             # Adjust the separator to match the header pipes
-#             merged_lines[1] = '|---|' * (header_pipes - 1) + '|'
-
-#     # Ensure consistent number of pipes in each row
-#     result = []
-#     header_pipes = merged_lines[0].count('|') # Use the header row to count the number of pipes
-
-#     for line in merged_lines:
-#         # Strip excessive whitespace around pipes
-#         line = re.sub(r'\s*\|\s*', '|', line.strip())
-
-#         # Fix inconsistent number of pipes by adjusting them to match the header
-#         pipe_count = line.count('|')
-#         if pipe_count < header_pipes:
-#             line += '|' * (header_pipes - pipe_count) # Add missing pipes
-#         elif pipe_count > header_pipes:
-#             # If too many pipes, split line and keep the first `header_pipes` columns
-#             columns = line.split('|')[:header_pipes + 1] # +1 to keep last pipe at the end
-#             line = '|'.join(columns)
-
-#         line = re.sub(r'(\d),(?=\d)', r'\1, ', line)
-
-#         result.append(line)
-
-#     # Join lines back into the cleaned markdown text
-#     cleaned_text = '\n'.join(result)
-
-#     # Replace numbers next to commas and other numbers with a space
-
-#     return cleaned_text
-
 def clean_column_name(column_name, max_length=20):
     # Convert to string
     column_name = str(column_name)
@@ -751,31 +634,6 @@ def create_unique_table_df_from_reference_table(reference_df:pd.DataFrame):
751
  .assign(Topic_number=lambda df: np.arange(1, len(df) + 1)) # Add numbering 1 to x
752
  )
753
 
754
- # new_unique_topics_df = reference_df[["General Topic", "Subtopic", "Sentiment"]]
755
-
756
- # new_unique_topics_df = new_unique_topics_df.rename(columns={new_unique_topics_df.columns[0]: "General Topic", new_unique_topics_df.columns[1]: "Subtopic", new_unique_topics_df.columns[2]: "Sentiment"})
757
-
758
- # # Join existing and new unique topics
759
- # out_unique_topics_df = new_unique_topics_df
760
-
761
- # out_unique_topics_df = out_unique_topics_df.rename(columns={out_unique_topics_df.columns[0]: "General Topic", out_unique_topics_df.columns[1]: "Subtopic", out_unique_topics_df.columns[2]: "Sentiment"})
762
-
763
- # #print("out_unique_topics_df:", out_unique_topics_df)
764
-
765
- # out_unique_topics_df = out_unique_topics_df.drop_duplicates(["General Topic", "Subtopic", "Sentiment"]).\
766
- # drop(["Response References", "Summary"], axis = 1, errors="ignore")
767
-
768
- # # Get count of rows that refer to particular topics
769
- # reference_counts = reference_df.groupby(["General Topic", "Subtopic", "Sentiment"]).agg({
770
- # 'Response References': 'size', # Count the number of references
771
- # 'Summary': lambda x: '<br>'.join(
772
- # sorted(set(x), key=lambda summary: reference_df.loc[reference_df['Summary'] == summary, 'Start row of group'].min())
773
- # )
774
- # }).reset_index()
775
-
776
- # # Join the counts to existing_unique_topics_df
777
- # out_unique_topics_df = out_unique_topics_df.merge(reference_counts, how='left', on=["General Topic", "Subtopic", "Sentiment"]).sort_values("Response References", ascending=False)
778
-
779
  return out_unique_topics_df
780
 
781
  # Convert output table to markdown and then to a pandas dataframe to csv
@@ -933,8 +791,6 @@ def call_llm_with_markdown_table_checks(batch_prompts: List[str],
933
  call_temperature, reported_batch_no, local_model, master=master
934
  )
935
 
936
- print("Responses:", responses)
937
-
938
  if (model_choice != "gemma_local") & (model_choice != "gemma_2b_it_local"):
939
  stripped_response = responses[-1].text.strip()
940
  else:
@@ -1041,7 +897,16 @@ def write_llm_output_and_logs(responses: List[ResponseObject],
1041
  return topic_table_out_path, reference_table_out_path, unique_topics_df_out_path, topic_with_response_df, markdown_table, out_reference_df, out_unique_topics_df, batch_file_path_details, is_error
1042
 
1043
  # Rename columns to ensure consistent use of data frames later in code
1044
- topic_with_response_df.columns = ["General Topic", "Subtopic", "Sentiment", "Response References", "Summary"]
1045
 
1046
  # Fill in NA rows with values from above (topics seem to be included only on one row):
1047
  topic_with_response_df = topic_with_response_df.ffill()
@@ -1073,8 +938,8 @@ def write_llm_output_and_logs(responses: List[ResponseObject],
1073
  sentiment = row.iloc[2] if pd.notna(row.iloc[2]) else ""
1074
  summary = row.iloc[4] if pd.notna(row.iloc[4]) else ""
1075
  # If the reference response column is very long, and there's nothing in the summary column, assume that the summary was put in the reference column
1076
- if not summary and len(row.iloc[3] > 30):
1077
- summary = row.iloc[3]
1078
 
1079
  summary = row_number_string_start + summary
1080
 
@@ -1151,6 +1016,128 @@ def write_llm_output_and_logs(responses: List[ResponseObject],
1151
 
1152
  return topic_table_out_path, reference_table_out_path, unique_topics_df_out_path, topic_with_response_df, markdown_table, out_reference_df, out_unique_topics_df, batch_file_path_details, is_error
1153

1154
  @spaces.GPU
1155
  def extract_topics(in_data_file,
1156
  file_data:pd.DataFrame,
@@ -1184,6 +1171,8 @@ def extract_topics(in_data_file,
1184
  sentiment_checkbox:str = "Negative, Neutral, or Positive",
1185
  force_zero_shot_radio:str = "No",
1186
  in_excel_sheets:List[str] = [],
 
 
1187
  max_tokens:int=max_tokens,
1188
  model_name_map:dict=model_name_map,
1189
  max_time_for_loop:int=max_time_for_loop,
@@ -1224,7 +1213,9 @@ def extract_topics(in_data_file,
1224
  - time_taken (float, optional): The amount of time taken to process the responses up until this point.
1225
  - sentiment_checkbox (str, optional): What type of sentiment analysis should the topic modeller do?
1226
  - force_zero_shot_radio (str, optional): Should responses be forced into a zero shot topic or not.
1227
- - in_excel_sheets (List[str], optional): List of excel sheets to load from input file
 
 
1228
  - max_tokens (int): The maximum number of tokens for the model.
1229
  - model_name_map (dict, optional): A dictionary mapping full model name to shortened.
1230
  - max_time_for_loop (int, optional): The number of seconds maximum that the function should run for before breaking (to run again, this is to avoid timeouts with some AWS services if deployed there).
@@ -1254,17 +1245,13 @@ def extract_topics(in_data_file,
1254
  if file_data.empty:
1255
  print("No data table found, loading from file")
1256
  try:
1257
- #print("in_data_file:", in_data_file)
1258
  in_colnames_drop, in_excel_sheets, file_name = put_columns_in_df(in_data_file)
1259
- #print("in_colnames:", in_colnames_drop)
1260
  file_data, file_name, num_batches = load_in_data_file(in_data_file, chosen_cols, batch_size_default, in_excel_sheets)
1261
- #print("file_data loaded in:", file_data)
1262
  except:
1263
  # Check if files and text exist
1264
  out_message = "Please enter a data file to summarise."
1265
  print(out_message)
1266
  raise Exception(out_message)
1267
- #return out_message, existing_topics_table, existing_unique_topics_df, existing_reference_df, out_file_paths, out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, out_file_paths, out_file_paths#, out_message
1268
 
1269
 
1270
  #model_choice_clean = replace_punctuation_with_underscore(model_choice)
@@ -1277,12 +1264,10 @@ def extract_topics(in_data_file,
1277
  latest_batch_completed = 0
1278
  out_message = []
1279
  out_file_paths = []
1280
- #print("model_choice_clean:", model_choice_clean)
1281
 
1282
  if (model_choice == "gemma_2b_it_local") & (RUN_LOCAL_MODEL == "1"):
1283
  progress(0.1, "Loading in Gemma 2b model")
1284
  local_model, tokenizer = load_model()
1285
- print("Local model loaded:", local_model)
1286
 
1287
  if num_batches > 0:
1288
  progress_measure = round(latest_batch_completed / num_batches, 1)
@@ -1301,12 +1286,10 @@ def extract_topics(in_data_file,
1301
  out_file_paths = []
1302
 
1303
 
1304
- if model_choice == "anthropic.claude-3-sonnet-20240229-v1:0" and file_data.shape[1] > 300:
1305
  out_message = "Your data has more than 300 rows, using the Sonnet model will be too expensive. Please choose the Haiku model instead."
1306
  print(out_message)
1307
- raise Exception(out_message)
1308
- #return out_message, existing_topics_table, existing_unique_topics_df, existing_reference_df, out_file_paths, out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, out_file_paths, out_file_paths#, out_message
1309
-
1310
 
1311
  if sentiment_checkbox == "Negative, Neutral, or Positive": sentiment_prompt = "In the third column, write the sentiment of the Subtopic: Negative, Neutral, or Positive"
1312
  elif sentiment_checkbox == "Negative or Positive": sentiment_prompt = "In the third column, write the sentiment of the Subtopic: Negative or Positive"
@@ -1337,10 +1320,10 @@ def extract_topics(in_data_file,
1337
  if latest_batch_completed >= 1 or candidate_topics is not None:
1338
 
1339
  # Prepare Gemini models before query
1340
- if model_choice in ["gemini-2.0-flash", "gemini-1.5-pro-002"]:
1341
  print("Using Gemini model:", model_choice)
1342
  model, config = construct_gemini_generative_model(in_api_key=in_api_key, temperature=temperature, model_choice=model_choice, system_prompt=add_existing_topics_system_prompt, max_tokens=max_tokens)
1343
- elif model_choice in ["anthropic.claude-3-haiku-20240307-v1:0", "anthropic.claude-3-sonnet-20240229-v1:0"]:
1344
  print("Using AWS Bedrock model:", model_choice)
1345
  else:
1346
  print("Using local model:", model_choice)
@@ -1351,109 +1334,17 @@ def extract_topics(in_data_file,
1351
 
1352
  # 'Zero shot topics' are those supplied by the user
1353
  max_topic_no = 120
1354
- zero_shot_topics = read_file(candidate_topics.name)
1355
-
1356
- # Max 120 topics allowed
1357
- if zero_shot_topics.shape[0] > max_topic_no:
1358
- print("Maximum", max_topic_no, "topics allowed to fit within large language model context limits.")
1359
- zero_shot_topics = zero_shot_topics.iloc[:max_topic_no, :]
1360
-
1361
- # Forward slashes in the topic names seems to confuse the model
1362
- if zero_shot_topics.shape[1] >= 1: # Check if there is at least one column
1363
- for x in zero_shot_topics.columns:
1364
- zero_shot_topics.loc[:, x] = (
1365
- zero_shot_topics.loc[:, x]
1366
- .str.strip()
1367
- .str.replace('\n', ' ')
1368
- .str.replace('\r', ' ')
1369
- .str.replace('/', ' or ')
1370
- .str.lower()
1371
- .str.capitalize())
1372
-
1373
- # If number of columns is 1, keep only subtopics
1374
- if zero_shot_topics.shape[1] == 1 and "General Topic" not in zero_shot_topics.columns:
1375
- zero_shot_topics_gen_topics_list = [""] * zero_shot_topics.shape[0]
1376
- zero_shot_topics_subtopics_list = list(zero_shot_topics.iloc[:, 0])
1377
- # Allow for possibility that the user only wants to set general topics and not subtopics
1378
- elif zero_shot_topics.shape[1] == 1 and "General Topic" in zero_shot_topics.columns:
1379
- zero_shot_topics_gen_topics_list = list(zero_shot_topics["General Topic"])
1380
- zero_shot_topics_subtopics_list = [""] * zero_shot_topics.shape[0]
1381
- # If general topic and subtopic are specified
1382
- elif set(["General Topic", "Subtopic"]).issubset(zero_shot_topics.columns):
1383
- zero_shot_topics_gen_topics_list = list(zero_shot_topics["General Topic"])
1384
- zero_shot_topics_subtopics_list = list(zero_shot_topics["Subtopic"])
1385
- # If number of columns is 2, keep general topics and subtopics
1386
- elif zero_shot_topics.shape[1] == 2:
1387
- zero_shot_topics_gen_topics_list = list(zero_shot_topics.iloc[:, 0])
1388
- zero_shot_topics_subtopics_list = list(zero_shot_topics.iloc[:, 1])
1389
- else:
1390
- # If there are more columns, just assume that the first column was meant to be a subtopic
1391
- zero_shot_topics_gen_topics_list = [""] * zero_shot_topics.shape[0]
1392
- zero_shot_topics_subtopics_list = list(zero_shot_topics.iloc[:, 0])
1393
-
1394
- # If the responses are being forced into zero shot topics, allow an option for nothing relevant
1395
- if force_zero_shot_radio == "Yes":
1396
- zero_shot_topics_gen_topics_list.append("")
1397
- zero_shot_topics_subtopics_list.append("No topics are relevant to the response")
1398
-
1399
- if create_revised_general_topics == True:
1400
- # Create the most up to date list of topics and subtopics.
1401
- # If there are candidate topics, but the existing_unique_topics_df hasn't yet been constructed, then create.
1402
- unique_topics_df = pd.DataFrame(data={
1403
- "General Topic":zero_shot_topics_gen_topics_list,
1404
- "Subtopic":zero_shot_topics_subtopics_list
1405
- })
1406
- unique_topics_markdown = unique_topics_df.to_markdown()
1407
-
1408
- print("unique_topics_markdown:", unique_topics_markdown)
1409
-
1410
- formatted_general_topics_system_prompt = create_general_topics_system_prompt.format(consultation_context=context_textbox, column_name=chosen_cols)
1411
-
1412
- # Format the general_topics prompt with the topics
1413
- formatted_general_topics_prompt = create_general_topics_prompt.format(topics=unique_topics_markdown)
1414
-
1415
- if model_choice == "gemma_2b_it_local":
1416
- formatted_general_topics_prompt = llama_cpp_prefix + formatted_general_topics_system_prompt + "\n" + formatted_general_topics_prompt + llama_cpp_suffix
1417
-
1418
- formatted_general_topics_prompt_list = [formatted_general_topics_prompt]
1419
-
1420
-
1421
-
1422
- whole_conversation = []
1423
-
1424
- general_topic_response, general_topic_conversation_history, general_topic_conversation, general_topic_conversation_metadata, response_text = call_llm_with_markdown_table_checks(batch_prompts, system_prompt, conversation_history, whole_conversation, whole_conversation_metadata, model, config, model_choice, temperature, reported_batch_no, local_model, MAX_OUTPUT_VALIDATION_ATTEMPTS, master = True)
1425
-
1426
- # Convert response text to a markdown table
1427
- try:
1428
- zero_shot_topics_df, is_error = convert_response_text_to_markdown_table(response_text, table_type = "Revised topics table")
1429
- print("Output revised zero shot topics table is:", zero_shot_topics_df)
1430
-
1431
- zero_shot_revised_path = output_folder + "zero_shot_topics_with_general_topics.csv"
1432
- #zero_shot_topics_df.to_csv(zero_shot_revised_path, index = None)
1433
- out_file_paths.append(zero_shot_revised_path)
1434
-
1435
- except Exception as e:
1436
- print("Error in parsing markdown table from response text:", e, "Not adding revised General Topics to table")
1437
- zero_shot_topics_df = pd.DataFrame(data={
1438
- "General Topic":zero_shot_topics_gen_topics_list,
1439
- "Subtopic":zero_shot_topics_subtopics_list})
1440
-
1441
- if zero_shot_topics_df.empty:
1442
- print("Creation of revised general topics df failed, reverting to original list")
1443
- zero_shot_topics_df = pd.DataFrame(data={
1444
- "General Topic":zero_shot_topics_gen_topics_list,
1445
- "Subtopic":zero_shot_topics_subtopics_list})
1446
- else:
1447
- zero_shot_topics_df = pd.DataFrame(data={
1448
- "General Topic":zero_shot_topics_gen_topics_list,
1449
- "Subtopic":zero_shot_topics_subtopics_list})
1450
-
1451
-
1452
- # This part concatenates all zero shot and new topics together, so that for the next prompt the LLM will have the full list available
1453
- if not existing_unique_topics_df.empty:
1454
- existing_unique_topics_df = pd.concat([existing_unique_topics_df, zero_shot_topics_df]).drop_duplicates("Subtopic")
1455
- else:
1456
- existing_unique_topics_df = zero_shot_topics_df
1457
 
1458
  if candidate_topics and not zero_shot_topics_df.empty:
1459
  # If you have already created revised zero shot topics, concat to the current
@@ -1464,24 +1355,40 @@ def extract_topics(in_data_file,
1464
  existing_unique_topics_df.fillna("", inplace=True)
1465
  existing_unique_topics_df["General Topic"] = existing_unique_topics_df["General Topic"].str.replace('(?i)^Nan$', '', regex=True)
1466
  existing_unique_topics_df["Subtopic"] = existing_unique_topics_df["Subtopic"].str.replace('(?i)^Nan$', '', regex=True)
1467
 
1468
  # print("existing_unique_topics_df:", existing_unique_topics_df)
1469
 
1470
  # If user has chosen to try to force zero shot topics, then the prompt is changed to ask the model not to deviate at all from submitted topic list.
1471
- if force_zero_shot_radio == "Yes":
1472
- unique_topics_markdown = existing_unique_topics_df[["Subtopic"]].drop_duplicates(["Subtopic"]).to_markdown(index=False)
1473
  topic_assignment_prompt = force_existing_topics_prompt
1474
  else:
1475
- unique_topics_markdown = existing_unique_topics_df[["General Topic", "Subtopic"]].drop_duplicates(["General Topic", "Subtopic"]).to_markdown(index=False)
1476
- topic_assignment_prompt = allow_new_topics_prompt
1477
-
1478
 
1479
  # Format the summary prompt with the response table and topics
1480
  formatted_system_prompt = add_existing_topics_system_prompt.format(consultation_context=context_textbox, column_name=chosen_cols)
1481
- formatted_summary_prompt = add_existing_topics_prompt.format(response_table=normalised_simple_markdown_table, topics=unique_topics_markdown, topic_assignment=topic_assignment_prompt, sentiment_choices=sentiment_prompt)
1482
 
1483
 
1484
- if model_choice == "gemma_2b_it_local":
1485
  formatted_summary_prompt = llama_cpp_prefix + formatted_system_prompt + "\n" + formatted_summary_prompt + llama_cpp_suffix
1486
  full_prompt = formatted_summary_prompt
1487
  else:
@@ -1499,7 +1406,7 @@ def extract_topics(in_data_file,
1499
  except Exception as e:
1500
  print(f"Error writing prompt to file {formatted_prompt_output_path}: {e}")
1501
 
1502
- if model_choice == "gemma_2b_it_local":
1503
  summary_prompt_list = [full_prompt] # Includes system prompt
1504
  else:
1505
  summary_prompt_list = [formatted_summary_prompt]
@@ -1510,13 +1417,9 @@ def extract_topics(in_data_file,
1510
  whole_conversation = []
1511
 
1512
  # Process requests to large language model
1513
- # responses, summary_conversation_history, whole_conversation, whole_conversation_metadata, response_text = process_requests(summary_prompt_list, add_existing_topics_system_prompt, summary_conversation_history, summary_whole_conversation, whole_conversation_metadata, model, config, model_choice, temperature, reported_batch_no, local_model, master = True)
1514
-
1515
  responses, conversation_history, whole_conversation, whole_conversation_metadata, response_text = call_llm_with_markdown_table_checks(summary_prompt_list, system_prompt, conversation_history, whole_conversation, whole_conversation_metadata, model, config, model_choice, temperature, reported_batch_no, local_model, MAX_OUTPUT_VALIDATION_ATTEMPTS, master = True)
1516
 
1517
- # print("responses:", responses[-1].text)
1518
- # print("Whole conversation metadata:", whole_conversation_metadata)
1519
-
1520
  topic_table_out_path, reference_table_out_path, unique_topics_df_out_path, new_topic_df, new_markdown_table, new_reference_df, new_unique_topics_df, master_batch_out_file_part, is_error = write_llm_output_and_logs(responses, whole_conversation, whole_conversation_metadata, file_name, latest_batch_completed, start_row, end_row, model_choice_clean, temperature, log_files_output_paths, existing_reference_df, existing_unique_topics_df, batch_size, chosen_cols, first_run=False)
1521
 
1522
  # Write final output to text file for logging purposes
@@ -1541,7 +1444,6 @@ def extract_topics(in_data_file,
1541
  if is_error == True:
1542
  final_message_out = "Could not complete summary, error in LLM output."
1543
  raise Exception(final_message_out)
1544
- #return unique_table_df_display_table_markdown, new_topic_df, new_unique_topics_df, new_reference_df, out_file_paths, out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, out_file_paths#, final_message_out
1545
 
1546
  # Write outputs to csv
1547
  ## Topics with references
@@ -1560,7 +1462,7 @@ def extract_topics(in_data_file,
1560
 
1561
  # Outputs for markdown table output
1562
  unique_table_df_display_table = new_unique_topics_df.apply(lambda col: col.map(lambda x: wrap_text(x, max_text_length=500)))
1563
- unique_table_df_display_table_markdown = unique_table_df_display_table.to_markdown(index=False)
1564
 
1565
  #whole_conversation_metadata.append(whole_conversation_metadata_str)
1566
  whole_conversation_metadata_str = ' '.join(whole_conversation_metadata)
@@ -1579,11 +1481,11 @@ def extract_topics(in_data_file,
1579
  #system_prompt = system_prompt + normalised_simple_markdown_table
1580
 
1581
  # Prepare Gemini models before query
1582
- if model_choice in ["gemini-2.0-flash", "gemini-1.5-pro-002"]:
1583
  print("Using Gemini model:", model_choice)
1584
  model, config = construct_gemini_generative_model(in_api_key=in_api_key, temperature=temperature, model_choice=model_choice, system_prompt=system_prompt, max_tokens=max_tokens)
1585
- elif model_choice in ["gemma_2b_it_local"]:
1586
- print("Using local Gemma 2b model")
1587
  else:
1588
  print("Using AWS Bedrock model:", model_choice)
1589
 
@@ -1597,7 +1499,7 @@ def extract_topics(in_data_file,
1597
  if prompt3: formatted_prompt3 = prompt3.format(response_table=normalised_simple_markdown_table, sentiment_choices=sentiment_prompt)
1598
  else: formatted_prompt3 = prompt3
1599
 
1600
- if model_choice == "gemma_2b_it_local":
1601
  formatted_initial_table_prompt = llama_cpp_prefix + formatted_initial_table_system_prompt + "\n" + formatted_initial_table_prompt + llama_cpp_suffix
1602
  formatted_prompt2 = llama_cpp_prefix + formatted_initial_table_system_prompt + "\n" + formatted_prompt2 + llama_cpp_suffix
1603
  formatted_prompt3 = llama_cpp_prefix + formatted_initial_table_system_prompt + "\n" + formatted_prompt3 + llama_cpp_suffix
@@ -1703,6 +1605,8 @@ def extract_topics(in_data_file,
1703
  # Set to a very high number so as not to mess with subsequent file processing by the user
1704
  #latest_batch_completed = 999
1705
 
 
 
1706
  toc = time.perf_counter()
1707
  final_time = (toc - tic) + time_taken
1708
  out_time = f"Everything finished in {round(final_time,1)} seconds."
@@ -1733,6 +1637,7 @@ def extract_topics(in_data_file,
1733
  ## Reference table mapping response numbers to topics
1734
  existing_reference_df.to_csv(reference_table_out_path, index=None)
1735
  out_file_paths.append(reference_table_out_path)
 
1736
 
1737
  # Create final unique topics table from reference table to ensure consistent numbers
1738
  final_out_unique_topics_df = create_unique_table_df_from_reference_table(existing_reference_df)
@@ -1787,13 +1692,10 @@ def extract_topics(in_data_file,
1787
 
1788
  print("latest_batch_completed at end of batch iterations to return is", latest_batch_completed)
1789
 
1790
- return unique_table_df_display_table_markdown, existing_topics_table, final_out_unique_topics_df, existing_reference_df, final_out_file_paths, final_out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, final_out_file_paths, final_out_file_paths, gr.Dataframe(value=modifiable_unique_topics_df, headers=None, col_count=(modifiable_unique_topics_df.shape[1], "fixed"), row_count = (modifiable_unique_topics_df.shape[0], "fixed"), visible=True, type="pandas"), final_out_file_paths
1791
-
1792
-
1793
- return unique_table_df_display_table_markdown, existing_topics_table, existing_unique_topics_df, existing_reference_df, out_file_paths, out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, out_file_paths, out_file_paths, gr.Dataframe(value=modifiable_unique_topics_df, headers=None, col_count=(modifiable_unique_topics_df.shape[1], "fixed"), row_count = (modifiable_unique_topics_df.shape[0], "fixed"), visible=True, type="pandas"), out_file_paths
1794
-
1795
 
1796
 
 
1797
 
1798
  def convert_reference_table_to_pivot_table(df:pd.DataFrame, basic_response_data:pd.DataFrame=pd.DataFrame()):
1799
 
@@ -2302,7 +2204,7 @@ def summarise_output_topics_query(model_choice:str, in_api_key:str, temperature:
2302
  whole_conversation_metadata = []
2303
 
2304
  # Prepare Gemini models before query
2305
- if model_choice in ["gemini-2.0-flash", "gemini-1.5-pro-002"]:
2306
  print("Using Gemini model:", model_choice)
2307
  model, config = construct_gemini_generative_model(in_api_key=in_api_key, temperature=temperature, model_choice=model_choice, system_prompt=system_prompt, max_tokens=max_tokens)
2308
  else:
@@ -2464,7 +2366,7 @@ def summarise_output_topics(summarised_references:pd.DataFrame,
2464
  if (model_choice == "gemma_2b_it_local") & (RUN_LOCAL_MODEL == "1"):
2465
  progress(0.1, "Loading in Gemma 2b model")
2466
  local_model, tokenizer = load_model()
2467
- print("Local model loaded:", local_model)
2468
 
2469
  summary_loop_description = "Creating summaries. " + str(latest_summary_completed) + " summaries completed so far."
2470
  summary_loop = tqdm(range(latest_summary_completed, length_all_summaries), desc="Creating summaries", unit="summaries")
 
19
 
20
  GradioFileData = gr.FileData
21
 
22
+ from tools.prompts import initial_table_prompt, prompt2, prompt3, system_prompt, summarise_topic_descriptions_prompt, summarise_topic_descriptions_system_prompt, add_existing_topics_system_prompt, add_existing_topics_prompt, create_general_topics_system_prompt, create_general_topics_prompt, force_existing_topics_prompt, allow_new_topics_prompt, force_single_topic_prompt
23
+ from tools.helper_functions import output_folder, read_file, get_or_create_env_var, model_name_map, put_columns_in_df, wrap_text, initial_clean, load_in_data_file, load_in_file
24
  from tools.chatfuncs import LlamaCPPGenerationConfig, call_llama_cpp_model, load_model, RUN_LOCAL_MODEL
25
 
26
  # ResponseObject class for AWS Bedrock calls
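Note: the ResponseObject definition itself sits outside this hunk; based on how it is constructed and read elsewhere in this file (ResponseObject(text=..., usage_metadata=...) and responses[-1].text), a minimal sketch would be:

class ResponseObject:
    # Stand-in mirroring the Gemini response surface (.text plus
    # .usage_metadata), so AWS Bedrock results can flow through the
    # same downstream handling.
    def __init__(self, text: str, usage_metadata: dict):
        self.text = text
        self.usage_metadata = usage_metadata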
 
59
 
60
  return text
61

62
  def load_in_previous_data_files(file_paths_partial_output:List[str], for_modified_table:bool=False):
63
  '''Load in data table from a partially completed consultation summary to continue it.'''
64
 
 
130
 
131
  return gr.Dataframe(value=unique_file_data, headers=None, col_count=(unique_file_data.shape[1], "fixed"), row_count = (unique_file_data.shape[0], "fixed"), visible=True, type="pandas"), reference_file_data, unique_file_data, reference_file_name, unique_file_name, out_file_names
132
 
133
+
134
  def get_basic_response_data(file_data:pd.DataFrame, chosen_cols:List[str], verify_titles:bool=False) -> pd.DataFrame:
135
 
136
  if not isinstance(chosen_cols, list):
 
144
  if verify_titles == True:
145
  basic_response_data = basic_response_data.rename(columns={chosen_cols[0]: "Response", chosen_cols[1]: "Title"})
146
  basic_response_data["Title"] = basic_response_data["Title"].str.strip()
147
+ basic_response_data["Title"] = basic_response_data["Title"].apply(initial_clean)
148
  else:
149
  basic_response_data = basic_response_data.rename(columns={chosen_cols[0]: "Response"})
150
 
151
  basic_response_data["Response"] = basic_response_data["Response"].str.strip()
152
+ basic_response_data["Response"] = basic_response_data["Response"].apply(initial_clean)
153
 
154
  return basic_response_data
155
 
 
192
  else:
193
  end_row = file_len + 1
194
 
195
+ #print("start_row:", start_row)
196
+ #print("end_row:", end_row)
197
 
198
  batch_basic_response_data = basic_response_data[start_row:end_row] # Select the current batch
199
 
200
+ #print("batch_basic_response_data:", batch_basic_response_data)
201
 
202
  # Now replace the reference numbers with numbers starting from 1
203
  batch_basic_response_data.loc[:, "Reference"] = batch_basic_response_data["Reference"] - start_row
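Note: a quick worked example of the re-numbering above, with assumed values. For a batch covering rows 150-199 of a 300-row table (1-based references 151-200), subtracting start_row re-bases the references to 1-50:

import pandas as pd

basic_response_data = pd.DataFrame({"Reference": range(1, 301)})
start_row, end_row = 150, 200
batch = basic_response_data[start_row:end_row].copy()
batch.loc[:, "Reference"] = batch["Reference"] - start_row
assert batch["Reference"].tolist() == list(range(1, 51))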
 
345
 
346
  # Now you can access both the text and metadata
347
  #print("Text:", response.text)
348
+ #print("Metadata:", response.usage_metadata)
349
  #print("Text:", response.text)
350
 
351
  return response
 
375
  progress_bar = range(0,number_of_api_retry_attempts)
376
 
377
  # Generate the model's response
378
+ if "gemini" in model_choice:
379
 
380
  for i in progress_bar:
381
  try:
 
398
 
399
  if i == number_of_api_retry_attempts:
400
  return ResponseObject(text="", usage_metadata={'RequestId':"FAILED"}), conversation_history
401
+ elif "anthropic.claude" in model_choice:
402
  for i in progress_bar:
403
  try:
404
  print("Calling AWS Claude model, attempt", i + 1)
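Note: replacing the exact model-name lists with substring checks means newly released model versions route correctly without editing these branches; a condensed sketch of the dispatch logic (route_model is a hypothetical name):

def route_model(model_choice: str) -> str:
    if "gemini" in model_choice:
        return "gemini"            # Google Gemini API
    elif "anthropic.claude" in model_choice:
        return "bedrock"           # AWS Bedrock Claude
    return "local"                 # e.g. gemma_2b_it_local

assert route_model("gemini-2.0-flash") == "gemini"
assert route_model("anthropic.claude-3-haiku-20240307-v1:0") == "bedrock"
assert route_model("gemma_2b_it_local") == "local"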
 
608
 
609
  return result
610

611
  def clean_column_name(column_name, max_length=20):
612
  # Convert to string
613
  column_name = str(column_name)
 
634
  .assign(Topic_number=lambda df: np.arange(1, len(df) + 1)) # Add numbering 1 to x
635
  )
636

637
  return out_unique_topics_df
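Note: the Topic_number column assigned above simply numbers the deduplicated topics from 1 to n in their final sort order, e.g.:

import numpy as np
import pandas as pd

topics = pd.DataFrame({"Subtopic": ["Parking", "Road safety", "Street lighting"]})
topics = topics.assign(Topic_number=lambda df: np.arange(1, len(df) + 1))
# topics["Topic_number"].tolist() == [1, 2, 3]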
638
 
639
  # Convert output table to markdown and then to a pandas dataframe to csv
 
791
  call_temperature, reported_batch_no, local_model, master=master
792
  )
793
 
 
 
794
  if (model_choice != "gemma_local") & (model_choice != "gemma_2b_it_local"):
795
  stripped_response = responses[-1].text.strip()
796
  else:
 
897
  return topic_table_out_path, reference_table_out_path, unique_topics_df_out_path, topic_with_response_df, markdown_table, out_reference_df, out_unique_topics_df, batch_file_path_details, is_error
898
 
899
  # Rename columns to ensure consistent use of data frames later in code
900
+ new_column_names = {
901
+ topic_with_response_df.columns[0]: "General Topic",
902
+ topic_with_response_df.columns[1]: "Subtopic",
903
+ topic_with_response_df.columns[2]: "Sentiment",
904
+ topic_with_response_df.columns[3]: "Response References",
905
+ topic_with_response_df.columns[4]: "Summary"
906
+ }
907
+
908
+ topic_with_response_df = topic_with_response_df.rename(columns=new_column_names)
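Note: the positional rename above assumes the parsed LLM table came back with at least five columns; a defensive variant (hypothetical helper, not part of this commit) would fail with a clear message rather than an IndexError on malformed model output:

import pandas as pd

EXPECTED_HEADINGS = ["General Topic", "Subtopic", "Sentiment", "Response References", "Summary"]

def rename_llm_table(df: pd.DataFrame) -> pd.DataFrame:
    # Guard before renaming the first five columns by position.
    if df.shape[1] < len(EXPECTED_HEADINGS):
        raise ValueError(f"Expected at least {len(EXPECTED_HEADINGS)} columns, got {df.shape[1]}")
    return df.rename(columns=dict(zip(df.columns[:len(EXPECTED_HEADINGS)], EXPECTED_HEADINGS)))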
909
+
910
 
911
  # Fill in NA rows with values from above (topics seem to be included only on one row):
912
  topic_with_response_df = topic_with_response_df.ffill()
 
938
  sentiment = row.iloc[2] if pd.notna(row.iloc[2]) else ""
939
  summary = row.iloc[4] if pd.notna(row.iloc[4]) else ""
940
  # If the reference response column is very long, and there's nothing in the summary column, assume that the summary was put in the reference column
941
+ if not summary and len(str(row.iloc[3])) > 30:
942
+ summary = row.iloc[3]
943
 
944
  summary = row_number_string_start + summary
945
 
 
1016
 
1017
  return topic_table_out_path, reference_table_out_path, unique_topics_df_out_path, topic_with_response_df, markdown_table, out_reference_df, out_unique_topics_df, batch_file_path_details, is_error
1018
 
1019
+ def generate_zero_shot_topics_df(zero_shot_topics:pd.DataFrame,
1020
+ force_zero_shot_radio:str="No",
1021
+ create_revised_general_topics:bool=False,
1022
+ max_topic_no:int=120):
1023
+
1024
+ # Max 120 topics allowed
1025
+ if zero_shot_topics.shape[0] > max_topic_no:
1026
+ print("Maximum", max_topic_no, "topics allowed to fit within large language model context limits.")
1027
+ zero_shot_topics = zero_shot_topics.iloc[:max_topic_no, :]
1028
+
1029
+ # Forward slashes in the topic names seems to confuse the model
1030
+ if zero_shot_topics.shape[1] >= 1: # Check if there is at least one column
1031
+ for x in zero_shot_topics.columns:
1032
+ if not zero_shot_topics[x].isnull().all():
1033
+ zero_shot_topics[x] = zero_shot_topics[x].apply(initial_clean)
1034
+
1035
+ zero_shot_topics.loc[:, x] = (
1036
+ zero_shot_topics.loc[:, x]
1037
+ .str.strip()
1038
+ .str.replace('\n', ' ')
1039
+ .str.replace('\r', ' ')
1040
+ .str.replace('/', ' or ')
1041
+ .str.lower()
1042
+ .str.capitalize())
1043
+
1044
+ #print("zero_shot_topics:", zero_shot_topics)
1045
+
1046
+ # If number of columns is 1, keep only subtopics
1047
+ if zero_shot_topics.shape[1] == 1 and "General topic" not in zero_shot_topics.columns:
1048
+ zero_shot_topics_gen_topics_list = [""] * zero_shot_topics.shape[0]
1049
+ zero_shot_topics_subtopics_list = list(zero_shot_topics.iloc[:, 0])
1050
+ # Allow for possibility that the user only wants to set general topics and not subtopics
1051
+ elif zero_shot_topics.shape[1] == 1 and "General topic" in zero_shot_topics.columns:
1052
+ zero_shot_topics_gen_topics_list = list(zero_shot_topics["General Topic"])
1053
+ zero_shot_topics_subtopics_list = [""] * zero_shot_topics.shape[0]
1054
+ # If general topic and subtopic are specified
1055
+ elif set(["General topic", "Subtopic"]).issubset(zero_shot_topics.columns):
1056
+ print("Found General topic and Subtopic in zero shot topics")
1057
+ zero_shot_topics_gen_topics_list = list(zero_shot_topics["General topic"])
1058
+ zero_shot_topics_subtopics_list = list(zero_shot_topics["Subtopic"])
1059
+
1060
+ # If number of columns is at least 2, keep general topics and subtopics
1061
+ elif zero_shot_topics.shape[1] >= 2 and "Description" not in zero_shot_topics.columns:
1062
+ zero_shot_topics_gen_topics_list = list(zero_shot_topics.iloc[:, 0])
1063
+ zero_shot_topics_subtopics_list = list(zero_shot_topics.iloc[:, 1])
1064
+ else:
1065
+ # If there are more columns, just assume that the first column was meant to be a subtopic
1066
+ zero_shot_topics_gen_topics_list = [""] * zero_shot_topics.shape[0]
1067
+ zero_shot_topics_subtopics_list = list(zero_shot_topics.iloc[:, 0])
1068
+
1069
+ # Add a description if column is present
1070
+ # print("zero_shot_topics.shape[1]:", zero_shot_topics.shape[1])
1071
+ if "Description" in zero_shot_topics.columns:
1072
+ zero_shot_topics_description_list = list(zero_shot_topics["Description"])
1073
+ #print("Description found in topic title. List is:", zero_shot_topics_description_list)
1074
+ elif zero_shot_topics.shape[1] >= 3:
1075
+ zero_shot_topics_description_list = list(zero_shot_topics.iloc[:, 2]) # Assume the third column is description
1076
+ else:
1077
+ zero_shot_topics_description_list = [""] * zero_shot_topics.shape[0]
1078
+
1079
+ # If the responses are being forced into zero shot topics, allow an option for nothing relevant
1080
+ if force_zero_shot_radio == "Yes":
1081
+ zero_shot_topics_gen_topics_list.append("")
1082
+ zero_shot_topics_subtopics_list.append("No relevant topic")
1083
+ zero_shot_topics_description_list.append("")
1084
+
1085
+ if create_revised_general_topics == True:
1086
+ pass
1087
+
1088
+ # The following currently doesn't really work. Excluded for now.
1089
+
1090
+ # unique_topics_df = pd.DataFrame(data={
1091
+ # "General Topic":zero_shot_topics_gen_topics_list,
1092
+ # "Subtopic":zero_shot_topics_subtopics_list,
1093
+ # "Description": zero_shot_topics_description_list
1094
+ # })
1095
+ # unique_topics_markdown = unique_topics_df.to_markdown()
1096
+
1097
+ # #print("unique_topics_markdown:", unique_topics_markdown)
1098
+
1099
+ # formatted_general_topics_system_prompt = create_general_topics_system_prompt.format(consultation_context=context_textbox, column_name=chosen_cols)
1100
+
1101
+ # # Format the general_topics prompt with the topics
1102
+ # formatted_general_topics_prompt = create_general_topics_prompt.format(topics=unique_topics_markdown)
1103
+
1104
+ # if "gemma" in model_choice:
1105
+ # formatted_general_topics_prompt = llama_cpp_prefix + formatted_general_topics_system_prompt + "\n" + formatted_general_topics_prompt + llama_cpp_suffix
1106
+
1107
+ # formatted_general_topics_prompt_list = [formatted_general_topics_prompt]
1108
+
1109
+ # whole_conversation = []
1110
+
1111
+ # general_topic_response, general_topic_conversation_history, general_topic_conversation, general_topic_conversation_metadata, response_text = call_llm_with_markdown_table_checks(batch_prompts, system_prompt, conversation_history, whole_conversation, whole_conversation_metadata, model, config, model_choice, temperature, reported_batch_no, local_model, MAX_OUTPUT_VALIDATION_ATTEMPTS, master = True)
1112
+
1113
+ # # Convert response text to a markdown table
1114
+ # try:
1115
+ # zero_shot_topics_df, is_error = convert_response_text_to_markdown_table(response_text, table_type = "Revised topics table")
1116
+ # print("Output revised zero shot topics table is:", zero_shot_topics_df)
1117
+
1118
+ # zero_shot_revised_path = output_folder + "zero_shot_topics_with_general_topics.csv"
1119
+ # out_file_paths.append(zero_shot_revised_path)
1120
+
1121
+ # except Exception as e:
1122
+ # print("Error in parsing markdown table from response text:", e, "Not adding revised General Topics to table")
1123
+
1124
+ # if zero_shot_topics_df.empty:
1125
+ # print("Creation of revised general topics df failed, reverting to original list")
1126
+ else:
1127
+ pass
1128
+
1129
+ # Add description or not
1130
+ zero_shot_topics_df = pd.DataFrame(data={
1131
+ "General Topic":zero_shot_topics_gen_topics_list,
1132
+ "Subtopic":zero_shot_topics_subtopics_list,
1133
+ "Description": zero_shot_topics_description_list
1134
+ })
1135
+
1136
+ #if not zero_shot_topics_df["Description"].isnull().all():
1137
+ # zero_shot_topics_df["Description"] = zero_shot_topics_df["Description"].apply(initial_clean)
1138
+
1139
+ return zero_shot_topics_df
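Note: a sketch of the input shapes this function accepts, following the branching above (example values assumed):

import pandas as pd

# A single unnamed column is read as Subtopics only:
subtopics_only = pd.DataFrame({"Topic": ["Road safety", "Parking"]})

# Named columns are used as given; a "Description" column (or any third
# column) is carried through to the output:
named = pd.DataFrame({
    "General topic": ["Transport", "Transport"],
    "Subtopic": ["Road safety", "Parking"],
})

# In every case the function returns a frame with the columns "General Topic",
# "Subtopic" and "Description"; with force_zero_shot_radio="Yes" a catch-all
# ("", "No relevant topic", "") row is appended so the model can opt out.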
1140
+
1141
  @spaces.GPU
1142
  def extract_topics(in_data_file,
1143
  file_data:pd.DataFrame,
 
1171
  sentiment_checkbox:str = "Negative, Neutral, or Positive",
1172
  force_zero_shot_radio:str = "No",
1173
  in_excel_sheets:List[str] = [],
1174
+ force_single_topic_radio:str = "No",
1175
+ force_single_topic_prompt:str=force_single_topic_prompt,
1176
  max_tokens:int=max_tokens,
1177
  model_name_map:dict=model_name_map,
1178
  max_time_for_loop:int=max_time_for_loop,
 
1213
  - time_taken (float, optional): The amount of time taken to process the responses up until this point.
1214
  - sentiment_checkbox (str, optional): What type of sentiment analysis should the topic modeller do?
1215
  - force_zero_shot_radio (str, optional): Should responses be forced into a zero shot topic or not.
1216
+ - in_excel_sheets (List[str], optional): List of excel sheets to load from input file.
1217
+ - force_single_topic_radio (str, optional): Should the model be forced to assign only a single topic to each response (effectively acting as a classifier).
1218
+ - force_single_topic_prompt (str, optional): The prompt for forcing the model to assign only one single topic to each response.
1219
  - max_tokens (int): The maximum number of tokens for the model.
1220
  - model_name_map (dict, optional): A dictionary mapping full model name to shortened.
1221
  - max_time_for_loop (int, optional): The number of seconds maximum that the function should run for before breaking (to run again, this is to avoid timeouts with some AWS services if deployed there).
 
1245
  if file_data.empty:
1246
  print("No data table found, loading from file")
1247
  try:
 
1248
  in_colnames_drop, in_excel_sheets, file_name = put_columns_in_df(in_data_file)
 
1249
  file_data, file_name, num_batches = load_in_data_file(in_data_file, chosen_cols, batch_size_default, in_excel_sheets)
 
1250
  except:
1251
  # Check if files and text exist
1252
  out_message = "Please enter a data file to summarise."
1253
  print(out_message)
1254
  raise Exception(out_message)
 
1255
 
1256
 
1257
  #model_choice_clean = replace_punctuation_with_underscore(model_choice)
 
1264
  latest_batch_completed = 0
1265
  out_message = []
1266
  out_file_paths = []
 
1267
 
1268
  if (model_choice == "gemma_2b_it_local") & (RUN_LOCAL_MODEL == "1"):
1269
  progress(0.1, "Loading in Gemma 2b model")
1270
  local_model, tokenizer = load_model()
 
1271
 
1272
  if num_batches > 0:
1273
  progress_measure = round(latest_batch_completed / num_batches, 1)
 
1286
  out_file_paths = []
1287
 
1288
 
1289
+ if "anthropic.claude-3-sonnet" in model_choice and file_data.shape[0] > 300:
1290
  out_message = "Your data has more than 300 rows, using the Sonnet model will be too expensive. Please choose the Haiku model instead."
1291
  print(out_message)
1292
+ raise Exception(out_message)
 
 
1293
 
1294
  if sentiment_checkbox == "Negative, Neutral, or Positive": sentiment_prompt = "In the third column, write the sentiment of the Subtopic: Negative, Neutral, or Positive"
1295
  elif sentiment_checkbox == "Negative or Positive": sentiment_prompt = "In the third column, write the sentiment of the Subtopic: Negative or Positive"
 
1320
  if latest_batch_completed >= 1 or candidate_topics is not None:
1321
 
1322
  # Prepare Gemini models before query
1323
+ if "gemini" in model_choice:
1324
  print("Using Gemini model:", model_choice)
1325
  model, config = construct_gemini_generative_model(in_api_key=in_api_key, temperature=temperature, model_choice=model_choice, system_prompt=add_existing_topics_system_prompt, max_tokens=max_tokens)
1326
+ elif "anthropic.claude" in model_choice:
1327
  print("Using AWS Bedrock model:", model_choice)
1328
  else:
1329
  print("Using local model:", model_choice)
 
1334
 
1335
  # 'Zero shot topics' are those supplied by the user
1336
  max_topic_no = 120
1337
+ zero_shot_topics = read_file(candidate_topics.name)
1338
+
1339
+ zero_shot_topics_df = generate_zero_shot_topics_df(zero_shot_topics, force_zero_shot_radio, create_revised_general_topics, max_topic_no)
1340
+
1341
+ #print("zero_shot_topics_df:", zero_shot_topics_df)
1342
+
1343
+ # This part concatenates all zero shot and new topics together, so that for the next prompt the LLM will have the full list available
1344
+ if not existing_unique_topics_df.empty and force_zero_shot_radio != "Yes":
1345
+ existing_unique_topics_df = pd.concat([existing_unique_topics_df, zero_shot_topics_df]).drop_duplicates("Subtopic")
1346
+ else:
1347
+ existing_unique_topics_df = zero_shot_topics_df
1348
 
1349
  if candidate_topics and not zero_shot_topics_df.empty:
1350
  # If you have already created revised zero shot topics, concat to the current
 
1355
  existing_unique_topics_df.fillna("", inplace=True)
1356
  existing_unique_topics_df["General Topic"] = existing_unique_topics_df["General Topic"].str.replace('(?i)^Nan$', '', regex=True)
1357
  existing_unique_topics_df["Subtopic"] = existing_unique_topics_df["Subtopic"].str.replace('(?i)^Nan$', '', regex=True)
1358
+ existing_unique_topics_df = existing_unique_topics_df.drop_duplicates()
1359
+ if "Description" in existing_unique_topics_df:
1360
+ if existing_unique_topics_df['Description'].isnull().all():
1361
+ existing_unique_topics_df.drop("Description", axis = 1, inplace = True)
1362
 
1363
  # print("existing_unique_topics_df:", existing_unique_topics_df)
1364
 
1365
  # If user has chosen to try to force zero shot topics, then the prompt is changed to ask the model not to deviate at all from submitted topic list.
1366
+ keep_cols = [
1367
+ col for col in ["General Topic", "Subtopic", "Description"]
1368
+ if col in existing_unique_topics_df.columns
1369
+ and not existing_unique_topics_df[col].replace(r'^\s*$', pd.NA, regex=True).isna().all()
1370
+ ]
1371
+
1372
+ if force_zero_shot_radio == "Yes":
1373
+ topics_df_for_markdown = existing_unique_topics_df[keep_cols].drop_duplicates(keep_cols)
1374
+ unique_topics_markdown = topics_df_for_markdown.to_markdown(index=False)
1375
  topic_assignment_prompt = force_existing_topics_prompt
1376
  else:
1377
+ topics_df_for_markdown = existing_unique_topics_df[keep_cols].drop_duplicates(keep_cols)
1378
+ unique_topics_markdown = topics_df_for_markdown.to_markdown(index=False)
1379
+ topic_assignment_prompt = allow_new_topics_prompt
1380
+
1381
+ # Should the output force a single topic assignment per response?
1382
+ if force_single_topic_radio != "Yes": force_single_topic_prompt = ""
1383
+ else:
1384
+ topic_assignment_prompt = topic_assignment_prompt.replace("Assign topics", "Assign a topic").replace("assign Subtopics", "assign a Subtopic").replace("Subtopics", "Subtopic").replace("Topics", "Topic").replace("topics", "a topic")
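Note: the effect of the chained replaces can be checked against the first instruction sentence of allow_new_topics_prompt (reproduced from the tools/prompts.py diff below):

sentence = "In the first and second columns, assign General Topics and Subtopics to Responses."
single = (sentence
          .replace("Assign topics", "Assign a topic")
          .replace("assign Subtopics", "assign a Subtopic")
          .replace("Subtopics", "Subtopic")
          .replace("Topics", "Topic")
          .replace("topics", "a topic"))
# single == "In the first and second columns, assign General Topic and Subtopic to Responses."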
1385
 
1386
  # Format the summary prompt with the response table and topics
1387
  formatted_system_prompt = add_existing_topics_system_prompt.format(consultation_context=context_textbox, column_name=chosen_cols)
1388
+ formatted_summary_prompt = add_existing_topics_prompt.format(response_table=normalised_simple_markdown_table, topics=unique_topics_markdown, topic_assignment=topic_assignment_prompt, force_single_topic=force_single_topic_prompt, sentiment_choices=sentiment_prompt)
1389
 
1390
 
1391
+ if "gemma" in model_choice:
1392
  formatted_summary_prompt = llama_cpp_prefix + formatted_system_prompt + "\n" + formatted_summary_prompt + llama_cpp_suffix
1393
  full_prompt = formatted_summary_prompt
1394
  else:
 
1406
  except Exception as e:
1407
  print(f"Error writing prompt to file {formatted_prompt_output_path}: {e}")
1408
 
1409
+ if "gemma" in model_choice:
1410
  summary_prompt_list = [full_prompt] # Includes system prompt
1411
  else:
1412
  summary_prompt_list = [formatted_summary_prompt]
 
1417
  whole_conversation = []
1418
 
1419
  # Process requests to large language model
 
 
1420
  responses, conversation_history, whole_conversation, whole_conversation_metadata, response_text = call_llm_with_markdown_table_checks(summary_prompt_list, system_prompt, conversation_history, whole_conversation, whole_conversation_metadata, model, config, model_choice, temperature, reported_batch_no, local_model, MAX_OUTPUT_VALIDATION_ATTEMPTS, master = True)
1421
 
1422
+ # Return output tables
 
 
1423
  topic_table_out_path, reference_table_out_path, unique_topics_df_out_path, new_topic_df, new_markdown_table, new_reference_df, new_unique_topics_df, master_batch_out_file_part, is_error = write_llm_output_and_logs(responses, whole_conversation, whole_conversation_metadata, file_name, latest_batch_completed, start_row, end_row, model_choice_clean, temperature, log_files_output_paths, existing_reference_df, existing_unique_topics_df, batch_size, chosen_cols, first_run=False)
1424
 
1425
  # Write final output to text file for logging purposes
 
1444
  if is_error == True:
1445
  final_message_out = "Could not complete summary, error in LLM output."
1446
  raise Exception(final_message_out)
 
1447
 
1448
  # Write outputs to csv
1449
  ## Topics with references
 
1462
 
1463
  # Outputs for markdown table output
1464
  unique_table_df_display_table = new_unique_topics_df.apply(lambda col: col.map(lambda x: wrap_text(x, max_text_length=500)))
1465
+ unique_table_df_display_table_markdown = unique_table_df_display_table[["General Topic", "Subtopic", "Sentiment", "Response References", "Summary"]].to_markdown(index=False)
1466
 
1467
  #whole_conversation_metadata.append(whole_conversation_metadata_str)
1468
  whole_conversation_metadata_str = ' '.join(whole_conversation_metadata)
 
1481
  #system_prompt = system_prompt + normalised_simple_markdown_table
1482
 
1483
  # Prepare Gemini models before query
1484
+ if "gemini" in model_choice:
1485
  print("Using Gemini model:", model_choice)
1486
  model, config = construct_gemini_generative_model(in_api_key=in_api_key, temperature=temperature, model_choice=model_choice, system_prompt=system_prompt, max_tokens=max_tokens)
1487
+ elif "gemma" in model_choice:
1488
+ print("Using local Gemma model:", model_choice)
1489
  else:
1490
  print("Using AWS Bedrock model:", model_choice)
1491
 
 
1499
  if prompt3: formatted_prompt3 = prompt3.format(response_table=normalised_simple_markdown_table, sentiment_choices=sentiment_prompt)
1500
  else: formatted_prompt3 = prompt3
1501
 
1502
+ if "gemma" in model_choice:
1503
  formatted_initial_table_prompt = llama_cpp_prefix + formatted_initial_table_system_prompt + "\n" + formatted_initial_table_prompt + llama_cpp_suffix
1504
  formatted_prompt2 = llama_cpp_prefix + formatted_initial_table_system_prompt + "\n" + formatted_prompt2 + llama_cpp_suffix
1505
  formatted_prompt3 = llama_cpp_prefix + formatted_initial_table_system_prompt + "\n" + formatted_prompt3 + llama_cpp_suffix
 
1605
  # Set to a very high number so as not to mess with subsequent file processing by the user
1606
  #latest_batch_completed = 999
1607
 
1608
+ join_file_paths = []
1609
+
1610
  toc = time.perf_counter()
1611
  final_time = (toc - tic) + time_taken
1612
  out_time = f"Everything finished in {round(final_time,1)} seconds."
 
1637
  ## Reference table mapping response numbers to topics
1638
  existing_reference_df.to_csv(reference_table_out_path, index=None)
1639
  out_file_paths.append(reference_table_out_path)
1640
+ join_file_paths.append(reference_table_out_path)
1641
 
1642
  # Create final unique topics table from reference table to ensure consistent numbers
1643
  final_out_unique_topics_df = create_unique_table_df_from_reference_table(existing_reference_df)
 
1692
 
1693
  print("latest_batch_completed at end of batch iterations to return is", latest_batch_completed)
1694
 
1695
+ return unique_table_df_display_table_markdown, existing_topics_table, final_out_unique_topics_df, existing_reference_df, final_out_file_paths, final_out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, final_out_file_paths, final_out_file_paths, gr.Dataframe(value=modifiable_unique_topics_df, headers=None, col_count=(modifiable_unique_topics_df.shape[1], "fixed"), row_count = (modifiable_unique_topics_df.shape[0], "fixed"), visible=True, type="pandas"), final_out_file_paths, join_file_paths
1696
 
1697
 
1698
+ return unique_table_df_display_table_markdown, existing_topics_table, existing_unique_topics_df, existing_reference_df, out_file_paths, out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, out_file_paths, out_file_paths, gr.Dataframe(value=modifiable_unique_topics_df, headers=None, col_count=(modifiable_unique_topics_df.shape[1], "fixed"), row_count = (modifiable_unique_topics_df.shape[0], "fixed"), visible=True, type="pandas"), out_file_paths, join_file_paths
1699
 
1700
  def convert_reference_table_to_pivot_table(df:pd.DataFrame, basic_response_data:pd.DataFrame=pd.DataFrame()):
1701
 
 
2204
  whole_conversation_metadata = []
2205
 
2206
  # Prepare Gemini models before query
2207
+ if "gemini" in model_choice:
2208
  print("Using Gemini model:", model_choice)
2209
  model, config = construct_gemini_generative_model(in_api_key=in_api_key, temperature=temperature, model_choice=model_choice, system_prompt=system_prompt, max_tokens=max_tokens)
2210
  else:
 
2366
  if (model_choice == "gemma_2b_it_local") & (RUN_LOCAL_MODEL == "1"):
2367
  progress(0.1, "Loading in Gemma 2b model")
2368
  local_model, tokenizer = load_model()
2369
+ #print("Local model loaded:", local_model)
2370
 
2371
  summary_loop_description = "Creating summaries. " + str(latest_summary_completed) + " summaries completed so far."
2372
  summary_loop = tqdm(range(latest_summary_completed, length_all_summaries), desc="Creating summaries", unit="summaries")
tools/prompts.py CHANGED
@@ -29,14 +29,16 @@ In the first column, write 'Not assessed'. In the second column, assign Subtopic
29
  allow_new_topics_prompt = """Create a new markdown table with the headings 'General Topic', 'Subtopic', 'Sentiment', 'Response References', and 'Summary'.
30
  In the first and second columns, assign General Topics and Subtopics to Responses. Assign topics from the Topics table above only if they are very relevant to the text of the Response. Fill in the General Topic and Sentiment for the Subtopic if they do not already exist. If you find a new topic that does not exist in the Topics table, add a new row to the new table. Make the General Topic and Subtopic as specific as possible. The subtopic should never be blank or empty."""
31
 
 
 
32
  add_existing_topics_prompt = """Responses are shown in the following Response table:
33
  {response_table}
34
 
35
  Topics known to be relevant to this dataset are shown in the following Topics table:
36
  {topics}
37
 
38
- Your task is to create one new markdown table, assigning responses from the Response table to existing topics, or to create new topics if no existing topics are relevant.
39
- {topic_assignment}
40
  {sentiment_choices}.
41
  In the fourth column, list each specific Response reference number that is relevant to the Subtopic, separated by commas. Do not write any other text in this column.
42
  In the fifth column, write a short summary of the Subtopic based on relevant responses - highlight specific issues that appear.
@@ -46,7 +48,6 @@ New table:"""
46
 
47
  # Return only one table in markdown format containing all relevant topics. Remove topics from the table that are not assigned to any response. Do not repeat Subtopics with the same Sentiment.
48
 
49
-
50
  summarise_topic_descriptions_system_prompt = system_prompt
51
 
52
  summarise_topic_descriptions_prompt = """Below is a table with number of paragraphs related to the data from the open text column:
@@ -57,6 +58,10 @@ Your task is to make a consolidated summary of the above text. {summary_format}.
57
 
58
  Summary:"""
59

60
 
61
  ## The following didn't work well in testing and so is not currently used
62
 
@@ -74,16 +79,16 @@ New Topics table:"""
74
  verify_titles_system_prompt = system_prompt
75
 
76
 
77
- verify_titles_prompt = """Response numbers alongside the Response text and assigned titles are shown in the table below:
78
  {response_table}
79
 
80
- The criteria for a suitable Title for these responses is that they should be readable, concise, and fully encapsulate the main subject of the response.
81
 
82
  Create a markdown table with four columns.
83
  The first column is 'Response References', and should contain just the response number under consideration.
84
- The second column is 'Is this a suitable title', answer the question with 'Yes' or 'No', with no other text.
85
  The third column is 'Explanation', give a short explanation for your response in the second column.
86
- The fourth column is 'Alternative title', suggest an alternative title for the response that meet the criteria stated above.
87
  Do not add any other text to your response.
88
 
89
  Output markdown table:"""
 
29
  allow_new_topics_prompt = """Create a new markdown table with the headings 'General Topic', 'Subtopic', 'Sentiment', 'Response References', and 'Summary'.
30
  In the first and second columns, assign General Topics and Subtopics to Responses. Assign topics from the Topics table above only if they are very relevant to the text of the Response. Fill in the General Topic and Sentiment for the Subtopic if they do not already exist. If you find a new topic that does not exist in the Topics table, add a new row to the new table. Make the General Topic and Subtopic as specific as possible. The subtopic should never be blank or empty."""
31
 
32
+ force_single_topic_prompt = """ Wherever possible, assign a response to one single topic, unless there are multiple topics that are equally relevant."""
33
+
34
  add_existing_topics_prompt = """Responses are shown in the following Response table:
35
  {response_table}
36
 
37
  Topics known to be relevant to this dataset are shown in the following Topics table:
38
  {topics}
39
 
40
+ Your task is to create one new markdown table, assigning responses from the Response table to topics.
41
+ {topic_assignment}{force_single_topic}
42
  {sentiment_choices}.
43
  In the fourth column, list each specific Response reference number that is relevant to the Subtopic, separated by commas. Do not write any other text in this column.
44
  In the fifth column, write a short summary of the Subtopic based on relevant responses - highlight specific issues that appear.
 
48
 
49
  # Return only one table in markdown format containing all relevant topics. Remove topics from the table that are not assigned to any response. Do not repeat Subtopics with the same Sentiment.
50
 
 
51
  summarise_topic_descriptions_system_prompt = system_prompt
52
 
53
  summarise_topic_descriptions_prompt = """Below is a table with number of paragraphs related to the data from the open text column:
 
58
 
59
  Summary:"""
60
 
61
+ single_para_summary_format_prompt = "Return a concise summary up to one paragraph long that summarises only the most important themes from the original text"
62
+
63
+ two_para_summary_format_prompt = "Return a summary up to two paragraphs long that includes as much detail as possible from the original text"
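Note: these two constants are intended for the {summary_format} placeholder in summarise_topic_descriptions_prompt above, e.g.:

prompt = "Your task is to make a consolidated summary of the above text. {summary_format}."
single_para = ("Return a concise summary up to one paragraph long that summarises "
               "only the most important themes from the original text")
print(prompt.format(summary_format=single_para))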
64
+
65
 
66
  ## The following didn't work well in testing and so is not currently used
67
 
 
79
  verify_titles_system_prompt = system_prompt
80
 
81
 
82
+ verify_titles_prompt = """Response numbers alongside the Response text and assigned descriptions are shown in the table below:
83
  {response_table}
84
 
85
+ The criterion for a suitable description for these responses is that it should be readable, concise, and fully encapsulate the main subject of the response.
86
 
87
  Create a markdown table with four columns.
88
  The first column is 'Response References', and should contain just the response number under consideration.
89
+ The second column is 'Is this a suitable description'; answer the question with 'Yes' or 'No', with no other text.
90
  The third column is 'Explanation', give a short explanation for your response in the second column.
91
+ The fourth column is 'Alternative description'; suggest an alternative description for the response that meets the criteria stated above.
92
  Do not add any other text to your response.
93
 
94
  Output markdown table:"""