Spaces:
Running
on
Zero
Running
on
Zero
Commit
·
f2d85f1
1
Parent(s):
125e31b
Improved on implementation of group-based analysis. Now should be possible all the way to summarisation
Browse files- app.py +13 -38
- tools/dedup_summaries.py +41 -53
- tools/helper_functions.py +12 -11
- tools/llm_api_call.py +76 -38
- tools/prompts.py +1 -1
app.py
CHANGED
|
@@ -80,23 +80,18 @@ with app:
|
|
| 80 |
# UI LAYOUT
|
| 81 |
###
|
| 82 |
|
| 83 |
-
gr.Markdown(
|
| 84 |
-
"""# Large language model topic modelling
|
| 85 |
|
| 86 |
Extract topics and summarise outputs using Large Language Models (LLMs, a Gemma model if local, Gemini Flash/Pro, or Claude 3 through AWS Bedrock if running on AWS). The app will query the LLM with batches of responses to produce summary tables, which are then compared iteratively to output a table with the general topics, subtopics, topic sentiment, and relevant text rows related to them. The prompts are designed for topic modelling public consultations, but they can be adapted to different contexts (see the LLM settings tab to modify).
|
| 87 |
|
| 88 |
Instructions on use can be found in the README.md file. Try it out with this [dummy development consultation dataset](https://huggingface.co/datasets/seanpedrickcase/dummy_development_consultation), which you can also try with [zero-shot topics](https://huggingface.co/datasets/seanpedrickcase/dummy_development_consultation/blob/main/example_zero_shot.csv), or this [dummy case notes dataset](https://huggingface.co/datasets/seanpedrickcase/dummy_case_notes).
|
| 89 |
|
| 90 |
-
You can use an AWS Bedrock model (Claude 3, paid), or Gemini (a free API, but with strict limits for the Pro model). The use of Gemini models requires an API key. To set up your own Gemini API key, go [here](https://aistudio.google.com/app/u/1/plan_information).
|
| 91 |
|
| 92 |
NOTE: that **API calls to Gemini are not considered secure**, so please only submit redacted, non-sensitive tabular files to this source. Also, large language models are not 100% accurate and may produce biased or harmful outputs. All outputs from this app **absolutely need to be checked by a human** to check for harmful outputs, hallucinations, and accuracy.""")
|
| 93 |
|
| 94 |
with gr.Tab(label="Extract topics"):
|
| 95 |
-
gr.Markdown(
|
| 96 |
-
"""
|
| 97 |
-
### Choose a tabular data file (xlsx or csv) of open text to extract topics from.
|
| 98 |
-
"""
|
| 99 |
-
)
|
| 100 |
with gr.Row():
|
| 101 |
model_choice = gr.Dropdown(value = default_model_choice, choices = model_full_names, label="LLM model to use", multiselect=False)
|
| 102 |
in_api_key = gr.Textbox(value = GEMINI_API_KEY, label="Enter Gemini API key (only if using Google API models)", lines=1, type="password")
|
|
@@ -138,10 +133,7 @@ with app:
|
|
| 138 |
s3_logs_output_textbox = gr.Textbox(label="Feedback submission logs", visible=False)
|
| 139 |
|
| 140 |
with gr.Tab(label="Modify, deduplicate, and summarise topic outputs"):
|
| 141 |
-
gr.Markdown(
|
| 142 |
-
"""
|
| 143 |
-
Load in previously completed Extract Topics output files ('reference_table', and 'unique_topics' files) to modify topics, deduplicate topics, or summarise the outputs. If you want pivot table outputs, please load in the original data file along with the selected open text column on the first tab before deduplicating or summarising.
|
| 144 |
-
""")
|
| 145 |
|
| 146 |
with gr.Accordion("Modify existing topics", open = False):
|
| 147 |
modification_input_files = gr.File(height=file_input_height, label="Upload files to modify topics", file_count= "multiple", file_types=['.xlsx', '.xls', '.csv', '.parquet', '.csv.gz'])
|
|
@@ -184,19 +176,13 @@ with app:
|
|
| 184 |
overall_summarised_output_markdown = gr.Markdown(value="### Overall summary will appear here", show_copy_button=True)
|
| 185 |
|
| 186 |
with gr.Tab(label="Topic table viewer"):
|
| 187 |
-
gr.Markdown(
|
| 188 |
-
"""
|
| 189 |
-
### View a 'unique_topic_table' csv file in markdown format.
|
| 190 |
-
""")
|
| 191 |
|
| 192 |
in_view_table = gr.File(height=file_input_height, label="Choose unique topic csv files", file_count= "single", file_types=['.csv', '.parquet', '.csv.gz'])
|
| 193 |
view_table_markdown = gr.Markdown(value = "", label="View table", show_copy_button=True)
|
| 194 |
|
| 195 |
with gr.Tab(label="Continue unfinished topic extraction"):
|
| 196 |
-
gr.Markdown(
|
| 197 |
-
"""
|
| 198 |
-
### Load in output files from a previous topic extraction process and continue topic extraction with new data.
|
| 199 |
-
""")
|
| 200 |
|
| 201 |
with gr.Accordion("Upload reference data file and unique data files", open = True):
|
| 202 |
in_previous_data_files = gr.File(height=file_input_height, label="Choose output csv files", file_count= "multiple", file_types=['.xlsx', '.xls', '.csv', '.parquet', '.csv.gz'])
|
|
@@ -204,11 +190,7 @@ with app:
|
|
| 204 |
continue_previous_data_files_btn = gr.Button(value="Continue previous topic extraction", variant="primary")
|
| 205 |
|
| 206 |
with gr.Tab(label="Verify descriptions"):
|
| 207 |
-
gr.Markdown(
|
| 208 |
-
"""
|
| 209 |
-
### Choose a tabular data file (xlsx or csv) with titles and original text to verify descriptions for.
|
| 210 |
-
"""
|
| 211 |
-
)
|
| 212 |
with gr.Row():
|
| 213 |
verify_model_choice = gr.Dropdown(value = default_model_choice, choices = model_full_names, label="LLM model to use", multiselect=False)
|
| 214 |
verify_in_api_key = gr.Textbox(value = "", label="Enter Gemini API key (only if using Google API models)", lines=1, type="password")
|
|
@@ -227,10 +209,7 @@ with app:
|
|
| 227 |
verify_modification_input_files_placeholder = gr.File(height=file_input_height, label="Placeholder for files to avoid errors", visible=False)
|
| 228 |
|
| 229 |
with gr.Tab(label="Topic extraction settings"):
|
| 230 |
-
gr.Markdown(
|
| 231 |
-
"""
|
| 232 |
-
Define settings that affect large language model output.
|
| 233 |
-
""")
|
| 234 |
with gr.Accordion("Settings for LLM generation", open = True):
|
| 235 |
temperature_slide = gr.Slider(minimum=0.1, maximum=1.0, value=0.1, label="Choose LLM temperature setting")
|
| 236 |
batch_size_number = gr.Number(label = "Number of responses to submit in a single LLM query", value = BATCH_SIZE_DEFAULT, precision=0, minimum=1, maximum=100)
|
|
@@ -296,32 +275,28 @@ with app:
|
|
| 296 |
success(fn=wrapper_extract_topics_per_column_value,
|
| 297 |
inputs=[in_group_col,
|
| 298 |
in_data_files,
|
| 299 |
-
|
| 300 |
file_data_state,
|
| 301 |
master_topic_df_state,
|
| 302 |
master_reference_df_state,
|
| 303 |
master_unique_topics_df_state,
|
| 304 |
display_topic_table_markdown,
|
| 305 |
reference_data_file_name_textbox,
|
| 306 |
-
|
| 307 |
total_number_of_batches,
|
| 308 |
in_api_key,
|
| 309 |
temperature_slide,
|
| 310 |
in_colnames,
|
| 311 |
model_choice,
|
| 312 |
candidate_topics,
|
| 313 |
-
|
| 314 |
first_loop_state,
|
| 315 |
conversation_metadata_textbox,
|
| 316 |
latest_batch_completed,
|
| 317 |
-
estimated_time_taken_number,
|
| 318 |
-
|
| 319 |
initial_table_prompt_textbox,
|
| 320 |
prompt_2_textbox,
|
| 321 |
prompt_3_textbox,
|
| 322 |
system_prompt_textbox,
|
| 323 |
-
add_to_existing_topics_system_prompt_textbox,
|
| 324 |
-
|
| 325 |
number_of_prompts,
|
| 326 |
batch_size_number,
|
| 327 |
context_textbox,
|
|
@@ -371,14 +346,14 @@ with app:
|
|
| 371 |
# When button pressed, summarise previous data
|
| 372 |
summarise_previous_data_btn.click(empty_output_vars_summarise, inputs=None, outputs=[summary_reference_table_sample_state, master_unique_topics_df_revised_summaries_state, master_reference_df_revised_summaries_state, summary_output_files, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, overall_summarisation_input_files]).\
|
| 373 |
success(load_in_previous_data_files, inputs=[summarisation_input_files], outputs=[master_reference_df_state, master_unique_topics_df_state, latest_batch_completed_no_loop, deduplication_input_files_status, reference_data_file_name_textbox, unique_topics_table_file_name_textbox]).\
|
| 374 |
-
success(sample_reference_table_summaries, inputs=[master_reference_df_state,
|
| 375 |
success(summarise_output_topics, inputs=[summary_reference_table_sample_state, master_unique_topics_df_state, master_reference_df_state, model_choice, in_api_key, temperature_slide, reference_data_file_name_textbox, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, in_data_files, in_excel_sheets, in_colnames, log_files_output_list_state, summarise_format_radio, output_folder_state], outputs=[summary_reference_table_sample_state, master_unique_topics_df_revised_summaries_state, master_reference_df_revised_summaries_state, summary_output_files, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, summarised_output_markdown, log_files_output, overall_summarisation_input_files], api_name="summarise_topics")
|
| 376 |
|
| 377 |
latest_summary_completed_num.change(summarise_output_topics, inputs=[summary_reference_table_sample_state, master_unique_topics_df_state, master_reference_df_state, model_choice, in_api_key, temperature_slide, reference_data_file_name_textbox, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, in_data_files, in_excel_sheets, in_colnames, log_files_output_list_state, summarise_format_radio, output_folder_state], outputs=[summary_reference_table_sample_state, master_unique_topics_df_revised_summaries_state, master_reference_df_revised_summaries_state, summary_output_files, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, summarised_output_markdown, log_files_output, overall_summarisation_input_files], scroll_to_output=True)
|
| 378 |
|
| 379 |
# SUMMARISE WHOLE TABLE PAGE
|
| 380 |
overall_summarise_previous_data_btn.click(load_in_previous_data_files, inputs=[overall_summarisation_input_files], outputs=[master_reference_df_state, master_unique_topics_df_state, latest_batch_completed_no_loop, deduplication_input_files_status, reference_data_file_name_textbox, unique_topics_table_file_name_textbox]).\
|
| 381 |
-
success(overall_summary, inputs=[master_unique_topics_df_state, model_choice, in_api_key, temperature_slide,
|
| 382 |
|
| 383 |
###
|
| 384 |
# CONTINUE PREVIOUS TOPIC EXTRACTION PAGE
|
|
|
|
| 80 |
# UI LAYOUT
|
| 81 |
###
|
| 82 |
|
| 83 |
+
gr.Markdown("""# Large language model topic modelling
|
|
|
|
| 84 |
|
| 85 |
Extract topics and summarise outputs using Large Language Models (LLMs, a Gemma model if local, Gemini Flash/Pro, or Claude 3 through AWS Bedrock if running on AWS). The app will query the LLM with batches of responses to produce summary tables, which are then compared iteratively to output a table with the general topics, subtopics, topic sentiment, and relevant text rows related to them. The prompts are designed for topic modelling public consultations, but they can be adapted to different contexts (see the LLM settings tab to modify).
|
| 86 |
|
| 87 |
Instructions on use can be found in the README.md file. Try it out with this [dummy development consultation dataset](https://huggingface.co/datasets/seanpedrickcase/dummy_development_consultation), which you can also try with [zero-shot topics](https://huggingface.co/datasets/seanpedrickcase/dummy_development_consultation/blob/main/example_zero_shot.csv), or this [dummy case notes dataset](https://huggingface.co/datasets/seanpedrickcase/dummy_case_notes).
|
| 88 |
|
| 89 |
+
You can use an AWS Bedrock model (Claude 3, paid), or Gemini (a free API, but with strict limits for the Pro model). The use of Gemini models requires an API key. To set up your own Gemini API key, go [here](https://aistudio.google.com/app/u/1/plan_information).
|
| 90 |
|
| 91 |
NOTE: that **API calls to Gemini are not considered secure**, so please only submit redacted, non-sensitive tabular files to this source. Also, large language models are not 100% accurate and may produce biased or harmful outputs. All outputs from this app **absolutely need to be checked by a human** to check for harmful outputs, hallucinations, and accuracy.""")
|
| 92 |
|
| 93 |
with gr.Tab(label="Extract topics"):
|
| 94 |
+
gr.Markdown("""### Choose a tabular data file (xlsx or csv) of open text to extract topics from.""")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 95 |
with gr.Row():
|
| 96 |
model_choice = gr.Dropdown(value = default_model_choice, choices = model_full_names, label="LLM model to use", multiselect=False)
|
| 97 |
in_api_key = gr.Textbox(value = GEMINI_API_KEY, label="Enter Gemini API key (only if using Google API models)", lines=1, type="password")
|
|
|
|
| 133 |
s3_logs_output_textbox = gr.Textbox(label="Feedback submission logs", visible=False)
|
| 134 |
|
| 135 |
with gr.Tab(label="Modify, deduplicate, and summarise topic outputs"):
|
| 136 |
+
gr.Markdown("""Load in previously completed Extract Topics output files ('reference_table', and 'unique_topics' files) to modify topics, deduplicate topics, or summarise the outputs. If you want pivot table outputs, please load in the original data file along with the selected open text column on the first tab before deduplicating or summarising.""")
|
|
|
|
|
|
|
|
|
|
| 137 |
|
| 138 |
with gr.Accordion("Modify existing topics", open = False):
|
| 139 |
modification_input_files = gr.File(height=file_input_height, label="Upload files to modify topics", file_count= "multiple", file_types=['.xlsx', '.xls', '.csv', '.parquet', '.csv.gz'])
|
|
|
|
| 176 |
overall_summarised_output_markdown = gr.Markdown(value="### Overall summary will appear here", show_copy_button=True)
|
| 177 |
|
| 178 |
with gr.Tab(label="Topic table viewer"):
|
| 179 |
+
gr.Markdown("""### View a 'unique_topic_table' csv file in markdown format.""")
|
|
|
|
|
|
|
|
|
|
| 180 |
|
| 181 |
in_view_table = gr.File(height=file_input_height, label="Choose unique topic csv files", file_count= "single", file_types=['.csv', '.parquet', '.csv.gz'])
|
| 182 |
view_table_markdown = gr.Markdown(value = "", label="View table", show_copy_button=True)
|
| 183 |
|
| 184 |
with gr.Tab(label="Continue unfinished topic extraction"):
|
| 185 |
+
gr.Markdown("""### Load in output files from a previous topic extraction process and continue topic extraction with new data.""")
|
|
|
|
|
|
|
|
|
|
| 186 |
|
| 187 |
with gr.Accordion("Upload reference data file and unique data files", open = True):
|
| 188 |
in_previous_data_files = gr.File(height=file_input_height, label="Choose output csv files", file_count= "multiple", file_types=['.xlsx', '.xls', '.csv', '.parquet', '.csv.gz'])
|
|
|
|
| 190 |
continue_previous_data_files_btn = gr.Button(value="Continue previous topic extraction", variant="primary")
|
| 191 |
|
| 192 |
with gr.Tab(label="Verify descriptions"):
|
| 193 |
+
gr.Markdown("""### Choose a tabular data file (xlsx or csv) with titles and original text to verify descriptions for.""")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 194 |
with gr.Row():
|
| 195 |
verify_model_choice = gr.Dropdown(value = default_model_choice, choices = model_full_names, label="LLM model to use", multiselect=False)
|
| 196 |
verify_in_api_key = gr.Textbox(value = "", label="Enter Gemini API key (only if using Google API models)", lines=1, type="password")
|
|
|
|
| 209 |
verify_modification_input_files_placeholder = gr.File(height=file_input_height, label="Placeholder for files to avoid errors", visible=False)
|
| 210 |
|
| 211 |
with gr.Tab(label="Topic extraction settings"):
|
| 212 |
+
gr.Markdown("""Define settings that affect large language model output.""")
|
|
|
|
|
|
|
|
|
|
| 213 |
with gr.Accordion("Settings for LLM generation", open = True):
|
| 214 |
temperature_slide = gr.Slider(minimum=0.1, maximum=1.0, value=0.1, label="Choose LLM temperature setting")
|
| 215 |
batch_size_number = gr.Number(label = "Number of responses to submit in a single LLM query", value = BATCH_SIZE_DEFAULT, precision=0, minimum=1, maximum=100)
|
|
|
|
| 275 |
success(fn=wrapper_extract_topics_per_column_value,
|
| 276 |
inputs=[in_group_col,
|
| 277 |
in_data_files,
|
|
|
|
| 278 |
file_data_state,
|
| 279 |
master_topic_df_state,
|
| 280 |
master_reference_df_state,
|
| 281 |
master_unique_topics_df_state,
|
| 282 |
display_topic_table_markdown,
|
| 283 |
reference_data_file_name_textbox,
|
|
|
|
| 284 |
total_number_of_batches,
|
| 285 |
in_api_key,
|
| 286 |
temperature_slide,
|
| 287 |
in_colnames,
|
| 288 |
model_choice,
|
| 289 |
candidate_topics,
|
|
|
|
| 290 |
first_loop_state,
|
| 291 |
conversation_metadata_textbox,
|
| 292 |
latest_batch_completed,
|
| 293 |
+
estimated_time_taken_number,
|
|
|
|
| 294 |
initial_table_prompt_textbox,
|
| 295 |
prompt_2_textbox,
|
| 296 |
prompt_3_textbox,
|
| 297 |
system_prompt_textbox,
|
| 298 |
+
add_to_existing_topics_system_prompt_textbox,
|
| 299 |
+
add_to_existing_topics_prompt_textbox,
|
| 300 |
number_of_prompts,
|
| 301 |
batch_size_number,
|
| 302 |
context_textbox,
|
|
|
|
| 346 |
# When button pressed, summarise previous data
|
| 347 |
summarise_previous_data_btn.click(empty_output_vars_summarise, inputs=None, outputs=[summary_reference_table_sample_state, master_unique_topics_df_revised_summaries_state, master_reference_df_revised_summaries_state, summary_output_files, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, overall_summarisation_input_files]).\
|
| 348 |
success(load_in_previous_data_files, inputs=[summarisation_input_files], outputs=[master_reference_df_state, master_unique_topics_df_state, latest_batch_completed_no_loop, deduplication_input_files_status, reference_data_file_name_textbox, unique_topics_table_file_name_textbox]).\
|
| 349 |
+
success(sample_reference_table_summaries, inputs=[master_reference_df_state, random_seed], outputs=[summary_reference_table_sample_state, summarised_references_markdown], api_name="sample_summaries").\
|
| 350 |
success(summarise_output_topics, inputs=[summary_reference_table_sample_state, master_unique_topics_df_state, master_reference_df_state, model_choice, in_api_key, temperature_slide, reference_data_file_name_textbox, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, in_data_files, in_excel_sheets, in_colnames, log_files_output_list_state, summarise_format_radio, output_folder_state], outputs=[summary_reference_table_sample_state, master_unique_topics_df_revised_summaries_state, master_reference_df_revised_summaries_state, summary_output_files, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, summarised_output_markdown, log_files_output, overall_summarisation_input_files], api_name="summarise_topics")
|
| 351 |
|
| 352 |
latest_summary_completed_num.change(summarise_output_topics, inputs=[summary_reference_table_sample_state, master_unique_topics_df_state, master_reference_df_state, model_choice, in_api_key, temperature_slide, reference_data_file_name_textbox, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, in_data_files, in_excel_sheets, in_colnames, log_files_output_list_state, summarise_format_radio, output_folder_state], outputs=[summary_reference_table_sample_state, master_unique_topics_df_revised_summaries_state, master_reference_df_revised_summaries_state, summary_output_files, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, summarised_output_markdown, log_files_output, overall_summarisation_input_files], scroll_to_output=True)
|
| 353 |
|
| 354 |
# SUMMARISE WHOLE TABLE PAGE
|
| 355 |
overall_summarise_previous_data_btn.click(load_in_previous_data_files, inputs=[overall_summarisation_input_files], outputs=[master_reference_df_state, master_unique_topics_df_state, latest_batch_completed_no_loop, deduplication_input_files_status, reference_data_file_name_textbox, unique_topics_table_file_name_textbox]).\
|
| 356 |
+
success(overall_summary, inputs=[master_unique_topics_df_state, model_choice, in_api_key, temperature_slide, unique_topics_table_file_name_textbox, summarised_outputs_list, output_folder_state], outputs=[overall_summary_output_files, overall_summarised_output_markdown], scroll_to_output=True, api_name="overall_summary")
|
| 357 |
|
| 358 |
###
|
| 359 |
# CONTINUE PREVIOUS TOPIC EXTRACTION PAGE
|
tools/dedup_summaries.py
CHANGED
|
@@ -9,7 +9,7 @@ from tqdm import tqdm
|
|
| 9 |
|
| 10 |
from tools.prompts import summarise_topic_descriptions_prompt, summarise_topic_descriptions_system_prompt, system_prompt, summarise_everything_prompt, comprehensive_summary_format_prompt
|
| 11 |
from tools.llm_funcs import construct_gemini_generative_model, process_requests, ResponseObject, load_model
|
| 12 |
-
from tools.helper_functions import create_topic_summary_df_from_reference_table, load_in_data_file, get_basic_response_data, convert_reference_table_to_pivot_table, wrap_text
|
| 13 |
from tools.config import OUTPUT_FOLDER, RUN_LOCAL_MODEL, MAX_COMMENT_CHARS, MAX_TOKENS, TIMEOUT_WAIT, NUMBER_OF_RETRY_ATTEMPTS, MAX_TIME_FOR_LOOP, BATCH_SIZE_DEFAULT, DEDUPLICATION_THRESHOLD, model_name_map, CHOSEN_LOCAL_MODEL_TYPE, LOCAL_REPO_ID, LOCAL_MODEL_FILE, LOCAL_MODEL_FOLDER, LLM_SEED
|
| 14 |
|
| 15 |
max_tokens = MAX_TOKENS
|
|
@@ -158,17 +158,17 @@ def deduplicate_topics(reference_df:pd.DataFrame,
|
|
| 158 |
print(out_message)
|
| 159 |
#raise Exception(out_message)
|
| 160 |
|
| 161 |
-
|
| 162 |
-
|
| 163 |
# Run through this x times to try to get all duplicate topics
|
| 164 |
if deduplicate_topics == "Yes":
|
|
|
|
|
|
|
| 165 |
for i in range(0, 8):
|
| 166 |
if merge_sentiment == "No":
|
| 167 |
if merge_general_topics == "No":
|
| 168 |
reference_df["old_category"] = reference_df["Subtopic"] + " | " + reference_df["Sentiment"]
|
| 169 |
reference_df_unique = reference_df.drop_duplicates("old_category")
|
| 170 |
|
| 171 |
-
deduplicated_topic_map_df = reference_df_unique.groupby(["General Topic", "Sentiment"]).apply(
|
| 172 |
lambda group: deduplicate_categories(
|
| 173 |
group["Subtopic"],
|
| 174 |
group["Sentiment"],
|
|
@@ -233,8 +233,6 @@ def deduplicate_topics(reference_df:pd.DataFrame,
|
|
| 233 |
# Remove rows where 'deduplicated_category' is blank or NaN
|
| 234 |
deduplicated_topic_map_df = deduplicated_topic_map_df.loc[(deduplicated_topic_map_df['deduplicated_category'].str.strip() != '') & ~(deduplicated_topic_map_df['deduplicated_category'].isnull()), ['old_category','deduplicated_category', 'match_score']]
|
| 235 |
|
| 236 |
-
#deduplicated_topic_map_df.to_csv(output_folder + "deduplicated_topic_map_df_" + str(i) + ".csv", index=None)
|
| 237 |
-
|
| 238 |
reference_df = reference_df.merge(deduplicated_topic_map_df, on="old_category", how="left")
|
| 239 |
|
| 240 |
reference_df.rename(columns={"Subtopic": "Subtopic_old", "Sentiment": "Sentiment_old"}, inplace=True)
|
|
@@ -246,14 +244,9 @@ def deduplicate_topics(reference_df:pd.DataFrame,
|
|
| 246 |
reference_df["Subtopic"] = reference_df["deduplicated_category"].combine_first(reference_df["Subtopic_old"])
|
| 247 |
reference_df["Sentiment"] = reference_df["Sentiment"].combine_first(reference_df["Sentiment_old"])
|
| 248 |
|
| 249 |
-
|
| 250 |
-
|
| 251 |
-
|
| 252 |
-
reference_df = reference_df[["Response References", "General Topic", "Subtopic", "Sentiment", "Summary", "Start row of group"]]
|
| 253 |
-
|
| 254 |
-
#reference_df["General Topic"] = reference_df["General Topic"].str.lower().str.capitalize()
|
| 255 |
-
#reference_df["Subtopic"] = reference_df["Subtopic"].str.lower().str.capitalize()
|
| 256 |
-
#reference_df["Sentiment"] = reference_df["Sentiment"].str.lower().str.capitalize()
|
| 257 |
|
| 258 |
if merge_general_topics == "Yes":
|
| 259 |
# Replace General topic names for each Subtopic with that for the Subtopic with the most responses
|
|
@@ -285,8 +278,10 @@ def deduplicate_topics(reference_df:pd.DataFrame,
|
|
| 285 |
# Clean up the DataFrame by dropping the UniqueCount column
|
| 286 |
reference_df.drop(columns=['UniqueCount'], inplace=True)
|
| 287 |
|
| 288 |
-
reference_df
|
| 289 |
-
|
|
|
|
|
|
|
| 290 |
# Update reference summary column with all summaries
|
| 291 |
reference_df["Summary"] = reference_df.groupby(
|
| 292 |
["Response References", "General Topic", "Subtopic", "Sentiment"]
|
|
@@ -301,19 +296,17 @@ def deduplicate_topics(reference_df:pd.DataFrame,
|
|
| 301 |
# Drop duplicates in the reference table - each comment should only have the same topic referred to once
|
| 302 |
reference_df.drop_duplicates(['Response References', 'General Topic', 'Subtopic', 'Sentiment'], inplace=True)
|
| 303 |
|
| 304 |
-
|
| 305 |
# Remake topic_summary_df based on new reference_df
|
| 306 |
topic_summary_df = create_topic_summary_df_from_reference_table(reference_df)
|
| 307 |
|
| 308 |
# Then merge the topic numbers back to the original dataframe
|
| 309 |
reference_df = reference_df.merge(
|
| 310 |
-
topic_summary_df[['General Topic', 'Subtopic', 'Sentiment', 'Topic_number']],
|
| 311 |
-
on=['General Topic', 'Subtopic', 'Sentiment'],
|
| 312 |
how='left'
|
| 313 |
)
|
| 314 |
|
| 315 |
-
else:
|
| 316 |
-
print("Topics have not beeen deduplicated")
|
| 317 |
|
| 318 |
|
| 319 |
if not file_data.empty:
|
|
@@ -337,13 +330,11 @@ def deduplicate_topics(reference_df:pd.DataFrame,
|
|
| 337 |
|
| 338 |
# Outputs for markdown table output
|
| 339 |
topic_summary_df_revised_display = topic_summary_df.apply(lambda col: col.map(lambda x: wrap_text(x, max_text_length=500)))
|
| 340 |
-
|
| 341 |
deduplicated_unique_table_markdown = topic_summary_df_revised_display.to_markdown(index=False)
|
| 342 |
|
| 343 |
return reference_df, topic_summary_df, output_files, log_output_files, deduplicated_unique_table_markdown
|
| 344 |
|
| 345 |
def sample_reference_table_summaries(reference_df:pd.DataFrame,
|
| 346 |
-
topic_summary_df:pd.DataFrame,
|
| 347 |
random_seed:int,
|
| 348 |
no_of_sampled_summaries:int=150):
|
| 349 |
|
|
@@ -354,7 +345,10 @@ def sample_reference_table_summaries(reference_df:pd.DataFrame,
|
|
| 354 |
all_summaries = pd.DataFrame()
|
| 355 |
output_files = []
|
| 356 |
|
| 357 |
-
|
|
|
|
|
|
|
|
|
|
| 358 |
|
| 359 |
if 'Revised summary' in reference_df.columns:
|
| 360 |
out_message = "Summary has already been created for this file"
|
|
@@ -389,7 +383,7 @@ def sample_reference_table_summaries(reference_df:pd.DataFrame,
|
|
| 389 |
|
| 390 |
summarised_references_markdown = summarised_references.to_markdown(index=False)
|
| 391 |
|
| 392 |
-
return summarised_references, summarised_references_markdown
|
| 393 |
|
| 394 |
def summarise_output_topics_query(model_choice:str, in_api_key:str, temperature:float, formatted_summary_prompt:str, summarise_topic_descriptions_system_prompt:str, local_model=[]):
|
| 395 |
conversation_history = []
|
|
@@ -453,7 +447,7 @@ def summarise_output_topics(summarised_references:pd.DataFrame,
|
|
| 453 |
out_metadata = []
|
| 454 |
local_model = []
|
| 455 |
summarised_output_markdown = ""
|
| 456 |
-
output_files = []
|
| 457 |
|
| 458 |
# Check for data for summarisations
|
| 459 |
if not topic_summary_df.empty and not reference_table_df.empty:
|
|
@@ -475,11 +469,12 @@ def summarise_output_topics(summarised_references:pd.DataFrame,
|
|
| 475 |
out_message = "No file data found, pivot table output will not be created."
|
| 476 |
print(out_message)
|
| 477 |
raise Exception(out_message)
|
|
|
|
|
|
|
|
|
|
| 478 |
|
| 479 |
-
try:
|
| 480 |
-
|
| 481 |
-
except:
|
| 482 |
-
all_summaries = summarised_references["Revised summary"].tolist()
|
| 483 |
|
| 484 |
length_all_summaries = len(all_summaries)
|
| 485 |
|
|
@@ -488,16 +483,14 @@ def summarise_output_topics(summarised_references:pd.DataFrame,
|
|
| 488 |
print("All summaries completed. Creating outputs.")
|
| 489 |
|
| 490 |
model_choice_clean = model_name_map[model_choice]
|
| 491 |
-
file_name = re.search(r'(.*?)(?:_batch_|_col_)', table_file_name).group(1) if re.search(r'(.*?)(?:_batch_|_col_)', table_file_name) else table_file_name
|
| 492 |
latest_batch_completed = int(re.search(r'batch_(\d+)_', table_file_name).group(1)) if 'batch_' in table_file_name else ""
|
| 493 |
batch_size_number = int(re.search(r'size_(\d+)_', table_file_name).group(1)) if 'size_' in table_file_name else ""
|
| 494 |
in_column_cleaned = re.search(r'col_(.*?)_reference', table_file_name).group(1) if 'col_' in table_file_name else ""
|
| 495 |
|
| 496 |
# Save outputs for each batch. If master file created, label file as master
|
| 497 |
-
if latest_batch_completed:
|
| 498 |
-
|
| 499 |
-
else:
|
| 500 |
-
batch_file_path_details = f"{file_name}_col_{in_column_cleaned}"
|
| 501 |
|
| 502 |
summarised_references["Revised summary"] = summarised_outputs
|
| 503 |
|
|
@@ -511,7 +504,7 @@ def summarise_output_topics(summarised_references:pd.DataFrame,
|
|
| 511 |
# If no new summary is available, keep the original
|
| 512 |
topic_summary_df_revised["Revised summary"] = topic_summary_df_revised["Revised summary"].combine_first(topic_summary_df_revised["Summary"])
|
| 513 |
|
| 514 |
-
topic_summary_df_revised = topic_summary_df_revised[["General Topic", "Subtopic", "Sentiment", "Number of responses", "Revised summary"]]
|
| 515 |
|
| 516 |
# Replace all instances of 'Rows X to Y:' that remain on some topics that have not had additional summaries
|
| 517 |
topic_summary_df_revised["Revised summary"] = topic_summary_df_revised["Revised summary"].str.replace("^Rows\s+\d+\s+to\s+\d+:\s*", "", regex=True)
|
|
@@ -545,7 +538,6 @@ def summarise_output_topics(summarised_references:pd.DataFrame,
|
|
| 545 |
|
| 546 |
###
|
| 547 |
topic_summary_df_revised_display = topic_summary_df_revised.apply(lambda col: col.map(lambda x: wrap_text(x, max_text_length=500)))
|
| 548 |
-
|
| 549 |
summarised_output_markdown = topic_summary_df_revised_display.to_markdown(index=False)
|
| 550 |
|
| 551 |
# Ensure same file name not returned twice
|
|
@@ -560,16 +552,14 @@ def summarise_output_topics(summarised_references:pd.DataFrame,
|
|
| 560 |
#print("Last summary number:", length_all_summaries)
|
| 561 |
|
| 562 |
if (model_choice == CHOSEN_LOCAL_MODEL_TYPE) & (RUN_LOCAL_MODEL == "1"):
|
| 563 |
-
|
| 564 |
-
|
| 565 |
-
#print("Local model loaded:", local_model)
|
| 566 |
|
| 567 |
summary_loop_description = "Creating summaries. " + str(latest_summary_completed) + " summaries completed so far."
|
| 568 |
summary_loop = tqdm(range(latest_summary_completed, length_all_summaries), desc="Creating summaries", unit="summaries")
|
| 569 |
|
| 570 |
if do_summaries == "Yes":
|
| 571 |
for summary_no in summary_loop:
|
| 572 |
-
|
| 573 |
print("Current summary number is:", summary_no)
|
| 574 |
|
| 575 |
summary_text = all_summaries[summary_no]
|
|
@@ -631,8 +621,9 @@ def overall_summary(topic_summary_df:pd.DataFrame,
|
|
| 631 |
latest_summary_completed = 0
|
| 632 |
output_files = []
|
| 633 |
|
| 634 |
-
model_choice_clean = model_name_map[model_choice]
|
| 635 |
-
|
|
|
|
| 636 |
latest_batch_completed = int(re.search(r'batch_(\d+)_', table_file_name).group(1)) if 'batch_' in table_file_name else ""
|
| 637 |
batch_size_number = int(re.search(r'size_(\d+)_', table_file_name).group(1)) if 'size_' in table_file_name else ""
|
| 638 |
in_column_cleaned = re.search(r'col_(.*?)_reference', table_file_name).group(1) if 'col_' in table_file_name else ""
|
|
@@ -659,14 +650,10 @@ def overall_summary(topic_summary_df:pd.DataFrame,
|
|
| 659 |
if do_summaries == "Yes":
|
| 660 |
for summary_no in summary_loop:
|
| 661 |
|
| 662 |
-
print("Current summary number is:", summary_no)
|
| 663 |
-
|
| 664 |
summary_text = topic_summary_df.to_markdown(index=False)
|
| 665 |
-
|
| 666 |
formatted_summary_prompt = [summarise_everything_prompt.format(topic_summary_table=summary_text, summary_format=comprehensive_summary_format_prompt)]
|
| 667 |
|
| 668 |
-
#print("formatted_summary_prompt:", formatted_summary_prompt)
|
| 669 |
-
|
| 670 |
try:
|
| 671 |
response, conversation_history, metadata = summarise_output_topics_query(model_choice, in_api_key, temperature, formatted_summary_prompt, summarise_topic_descriptions_system_prompt, local_model)
|
| 672 |
summarised_output = response
|
|
@@ -687,16 +674,17 @@ def overall_summary(topic_summary_df:pd.DataFrame,
|
|
| 687 |
toc = time.perf_counter()
|
| 688 |
time_taken = tic - toc
|
| 689 |
|
| 690 |
-
# Define the output file path for the
|
| 691 |
-
|
|
|
|
| 692 |
|
| 693 |
# Write the formatted prompt to the specified file
|
| 694 |
try:
|
| 695 |
-
with open(
|
| 696 |
f.write(summarised_output)
|
| 697 |
-
output_files.append(
|
| 698 |
except Exception as e:
|
| 699 |
-
print(f"Error writing prompt to file {
|
| 700 |
|
| 701 |
output_files = list(set(output_files))
|
| 702 |
|
|
|
|
| 9 |
|
| 10 |
from tools.prompts import summarise_topic_descriptions_prompt, summarise_topic_descriptions_system_prompt, system_prompt, summarise_everything_prompt, comprehensive_summary_format_prompt
|
| 11 |
from tools.llm_funcs import construct_gemini_generative_model, process_requests, ResponseObject, load_model
|
| 12 |
+
from tools.helper_functions import create_topic_summary_df_from_reference_table, load_in_data_file, get_basic_response_data, convert_reference_table_to_pivot_table, wrap_text, clean_column_name
|
| 13 |
from tools.config import OUTPUT_FOLDER, RUN_LOCAL_MODEL, MAX_COMMENT_CHARS, MAX_TOKENS, TIMEOUT_WAIT, NUMBER_OF_RETRY_ATTEMPTS, MAX_TIME_FOR_LOOP, BATCH_SIZE_DEFAULT, DEDUPLICATION_THRESHOLD, model_name_map, CHOSEN_LOCAL_MODEL_TYPE, LOCAL_REPO_ID, LOCAL_MODEL_FILE, LOCAL_MODEL_FOLDER, LLM_SEED
|
| 14 |
|
| 15 |
max_tokens = MAX_TOKENS
|
|
|
|
| 158 |
print(out_message)
|
| 159 |
#raise Exception(out_message)
|
| 160 |
|
|
|
|
|
|
|
| 161 |
# Run through this x times to try to get all duplicate topics
|
| 162 |
if deduplicate_topics == "Yes":
|
| 163 |
+
if "Group" not in reference_df.columns:
|
| 164 |
+
reference_df["Group"] = "All"
|
| 165 |
for i in range(0, 8):
|
| 166 |
if merge_sentiment == "No":
|
| 167 |
if merge_general_topics == "No":
|
| 168 |
reference_df["old_category"] = reference_df["Subtopic"] + " | " + reference_df["Sentiment"]
|
| 169 |
reference_df_unique = reference_df.drop_duplicates("old_category")
|
| 170 |
|
| 171 |
+
deduplicated_topic_map_df = reference_df_unique.groupby(["General Topic", "Sentiment", "Group"]).apply(
|
| 172 |
lambda group: deduplicate_categories(
|
| 173 |
group["Subtopic"],
|
| 174 |
group["Sentiment"],
|
|
|
|
| 233 |
# Remove rows where 'deduplicated_category' is blank or NaN
|
| 234 |
deduplicated_topic_map_df = deduplicated_topic_map_df.loc[(deduplicated_topic_map_df['deduplicated_category'].str.strip() != '') & ~(deduplicated_topic_map_df['deduplicated_category'].isnull()), ['old_category','deduplicated_category', 'match_score']]
|
| 235 |
|
|
|
|
|
|
|
| 236 |
reference_df = reference_df.merge(deduplicated_topic_map_df, on="old_category", how="left")
|
| 237 |
|
| 238 |
reference_df.rename(columns={"Subtopic": "Subtopic_old", "Sentiment": "Sentiment_old"}, inplace=True)
|
|
|
|
| 244 |
reference_df["Subtopic"] = reference_df["deduplicated_category"].combine_first(reference_df["Subtopic_old"])
|
| 245 |
reference_df["Sentiment"] = reference_df["Sentiment"].combine_first(reference_df["Sentiment_old"])
|
| 246 |
|
| 247 |
+
#reference_df.drop(['old_category', 'deduplicated_category', "Subtopic_old", "Sentiment_old"], axis=1, inplace=True, errors="ignore")
|
| 248 |
+
#print("reference_df:", reference_df)
|
| 249 |
+
reference_df = reference_df[["Response References", "General Topic", "Subtopic", "Sentiment", "Summary", "Start row of group", "Group"]]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 250 |
|
| 251 |
if merge_general_topics == "Yes":
|
| 252 |
# Replace General topic names for each Subtopic with that for the Subtopic with the most responses
|
|
|
|
| 278 |
# Clean up the DataFrame by dropping the UniqueCount column
|
| 279 |
reference_df.drop(columns=['UniqueCount'], inplace=True)
|
| 280 |
|
| 281 |
+
#print("reference_df:", reference_df)
|
| 282 |
+
reference_df = reference_df[["Response References", "General Topic", "Subtopic", "Sentiment", "Summary", "Start row of group", "Group"]]
|
| 283 |
+
#reference_df.drop(['old_category', 'deduplicated_category', "Subtopic_old", "Sentiment_old"], axis=1, inplace=True, errors="ignore")
|
| 284 |
+
|
| 285 |
# Update reference summary column with all summaries
|
| 286 |
reference_df["Summary"] = reference_df.groupby(
|
| 287 |
["Response References", "General Topic", "Subtopic", "Sentiment"]
|
|
|
|
| 296 |
# Drop duplicates in the reference table - each comment should only have the same topic referred to once
|
| 297 |
reference_df.drop_duplicates(['Response References', 'General Topic', 'Subtopic', 'Sentiment'], inplace=True)
|
| 298 |
|
|
|
|
| 299 |
# Remake topic_summary_df based on new reference_df
|
| 300 |
topic_summary_df = create_topic_summary_df_from_reference_table(reference_df)
|
| 301 |
|
| 302 |
# Then merge the topic numbers back to the original dataframe
|
| 303 |
reference_df = reference_df.merge(
|
| 304 |
+
topic_summary_df[['General Topic', 'Subtopic', 'Sentiment', 'Group', 'Topic_number']],
|
| 305 |
+
on=['General Topic', 'Subtopic', 'Sentiment', 'Group'],
|
| 306 |
how='left'
|
| 307 |
)
|
| 308 |
|
| 309 |
+
else: print("Topics have not beeen deduplicated")
|
|
|
|
| 310 |
|
| 311 |
|
| 312 |
if not file_data.empty:
|
|
|
|
| 330 |
|
| 331 |
# Outputs for markdown table output
|
| 332 |
topic_summary_df_revised_display = topic_summary_df.apply(lambda col: col.map(lambda x: wrap_text(x, max_text_length=500)))
|
|
|
|
| 333 |
deduplicated_unique_table_markdown = topic_summary_df_revised_display.to_markdown(index=False)
|
| 334 |
|
| 335 |
return reference_df, topic_summary_df, output_files, log_output_files, deduplicated_unique_table_markdown
|
| 336 |
|
| 337 |
def sample_reference_table_summaries(reference_df:pd.DataFrame,
|
|
|
|
| 338 |
random_seed:int,
|
| 339 |
no_of_sampled_summaries:int=150):
|
| 340 |
|
|
|
|
| 345 |
all_summaries = pd.DataFrame()
|
| 346 |
output_files = []
|
| 347 |
|
| 348 |
+
if "Group" not in reference_df.columns:
|
| 349 |
+
reference_df["Group"] = "All"
|
| 350 |
+
|
| 351 |
+
reference_df_grouped = reference_df.groupby(["General Topic", "Subtopic", "Sentiment", "Group"])
|
| 352 |
|
| 353 |
if 'Revised summary' in reference_df.columns:
|
| 354 |
out_message = "Summary has already been created for this file"
|
|
|
|
| 383 |
|
| 384 |
summarised_references_markdown = summarised_references.to_markdown(index=False)
|
| 385 |
|
| 386 |
+
return summarised_references, summarised_references_markdown#, reference_df, topic_summary_df
|
| 387 |
|
| 388 |
def summarise_output_topics_query(model_choice:str, in_api_key:str, temperature:float, formatted_summary_prompt:str, summarise_topic_descriptions_system_prompt:str, local_model=[]):
|
| 389 |
conversation_history = []
|
|
|
|
| 447 |
out_metadata = []
|
| 448 |
local_model = []
|
| 449 |
summarised_output_markdown = ""
|
| 450 |
+
output_files = []
|
| 451 |
|
| 452 |
# Check for data for summarisations
|
| 453 |
if not topic_summary_df.empty and not reference_table_df.empty:
|
|
|
|
| 469 |
out_message = "No file data found, pivot table output will not be created."
|
| 470 |
print(out_message)
|
| 471 |
raise Exception(out_message)
|
| 472 |
+
|
| 473 |
+
if "Group" not in reference_table_df.columns: reference_table_df["Group"] = "All"
|
| 474 |
+
if "Group" not in topic_summary_df.columns: topic_summary_df["Group"] = "All"
|
| 475 |
|
| 476 |
+
try: all_summaries = summarised_references["Summary"].tolist()
|
| 477 |
+
except: all_summaries = summarised_references["Revised summary"].tolist()
|
|
|
|
|
|
|
| 478 |
|
| 479 |
length_all_summaries = len(all_summaries)
|
| 480 |
|
|
|
|
| 483 |
print("All summaries completed. Creating outputs.")
|
| 484 |
|
| 485 |
model_choice_clean = model_name_map[model_choice]
|
| 486 |
+
file_name = re.search(r'(.*?)(?:_all_|_final_|_batch_|_col_)', table_file_name).group(1) if re.search(r'(.*?)(?:_all_|_final_|_batch_|_col_)', table_file_name) else table_file_name
|
| 487 |
latest_batch_completed = int(re.search(r'batch_(\d+)_', table_file_name).group(1)) if 'batch_' in table_file_name else ""
|
| 488 |
batch_size_number = int(re.search(r'size_(\d+)_', table_file_name).group(1)) if 'size_' in table_file_name else ""
|
| 489 |
in_column_cleaned = re.search(r'col_(.*?)_reference', table_file_name).group(1) if 'col_' in table_file_name else ""
|
| 490 |
|
| 491 |
# Save outputs for each batch. If master file created, label file as master
|
| 492 |
+
if latest_batch_completed: batch_file_path_details = f"{file_name}_batch_{latest_batch_completed}_size_{batch_size_number}_col_{in_column_cleaned}"
|
| 493 |
+
else: batch_file_path_details = f"{file_name}_col_{in_column_cleaned}"
|
|
|
|
|
|
|
| 494 |
|
| 495 |
summarised_references["Revised summary"] = summarised_outputs
|
| 496 |
|
|
|
|
| 504 |
# If no new summary is available, keep the original
|
| 505 |
topic_summary_df_revised["Revised summary"] = topic_summary_df_revised["Revised summary"].combine_first(topic_summary_df_revised["Summary"])
|
| 506 |
|
| 507 |
+
topic_summary_df_revised = topic_summary_df_revised[["General Topic", "Subtopic", "Sentiment", "Group", "Number of responses", "Revised summary"]]
|
| 508 |
|
| 509 |
# Replace all instances of 'Rows X to Y:' that remain on some topics that have not had additional summaries
|
| 510 |
topic_summary_df_revised["Revised summary"] = topic_summary_df_revised["Revised summary"].str.replace("^Rows\s+\d+\s+to\s+\d+:\s*", "", regex=True)
|
|
|
|
| 538 |
|
| 539 |
###
|
| 540 |
topic_summary_df_revised_display = topic_summary_df_revised.apply(lambda col: col.map(lambda x: wrap_text(x, max_text_length=500)))
|
|
|
|
| 541 |
summarised_output_markdown = topic_summary_df_revised_display.to_markdown(index=False)
|
| 542 |
|
| 543 |
# Ensure same file name not returned twice
|
|
|
|
| 552 |
#print("Last summary number:", length_all_summaries)
|
| 553 |
|
| 554 |
if (model_choice == CHOSEN_LOCAL_MODEL_TYPE) & (RUN_LOCAL_MODEL == "1"):
|
| 555 |
+
progress(0.1, f"Loading in local model: {CHOSEN_LOCAL_MODEL_TYPE}")
|
| 556 |
+
local_model, tokenizer = load_model(local_model_type=CHOSEN_LOCAL_MODEL_TYPE, repo_id=LOCAL_REPO_ID, model_filename=LOCAL_MODEL_FILE, model_dir=LOCAL_MODEL_FOLDER)
|
|
|
|
| 557 |
|
| 558 |
summary_loop_description = "Creating summaries. " + str(latest_summary_completed) + " summaries completed so far."
|
| 559 |
summary_loop = tqdm(range(latest_summary_completed, length_all_summaries), desc="Creating summaries", unit="summaries")
|
| 560 |
|
| 561 |
if do_summaries == "Yes":
|
| 562 |
for summary_no in summary_loop:
|
|
|
|
| 563 |
print("Current summary number is:", summary_no)
|
| 564 |
|
| 565 |
summary_text = all_summaries[summary_no]
|
|
|
|
| 621 |
latest_summary_completed = 0
|
| 622 |
output_files = []
|
| 623 |
|
| 624 |
+
model_choice_clean = model_name_map[model_choice]
|
| 625 |
+
model_choice_clean_short = clean_column_name(model_choice_clean, max_length=20, front_characters=False)
|
| 626 |
+
file_name = re.search(r'(.*?)(?:_all_|_final_|_batch_|_col_)', table_file_name).group(1) if re.search(r'(.*?)(?:_all_|_final_|_batch_|_col_)', table_file_name) else table_file_name
|
| 627 |
latest_batch_completed = int(re.search(r'batch_(\d+)_', table_file_name).group(1)) if 'batch_' in table_file_name else ""
|
| 628 |
batch_size_number = int(re.search(r'size_(\d+)_', table_file_name).group(1)) if 'size_' in table_file_name else ""
|
| 629 |
in_column_cleaned = re.search(r'col_(.*?)_reference', table_file_name).group(1) if 'col_' in table_file_name else ""
|
|
|
|
| 650 |
if do_summaries == "Yes":
|
| 651 |
for summary_no in summary_loop:
|
| 652 |
|
|
|
|
|
|
|
| 653 |
summary_text = topic_summary_df.to_markdown(index=False)
|
| 654 |
+
|
| 655 |
formatted_summary_prompt = [summarise_everything_prompt.format(topic_summary_table=summary_text, summary_format=comprehensive_summary_format_prompt)]
|
| 656 |
|
|
|
|
|
|
|
| 657 |
try:
|
| 658 |
response, conversation_history, metadata = summarise_output_topics_query(model_choice, in_api_key, temperature, formatted_summary_prompt, summarise_topic_descriptions_system_prompt, local_model)
|
| 659 |
summarised_output = response
|
|
|
|
| 674 |
toc = time.perf_counter()
|
| 675 |
time_taken = tic - toc
|
| 676 |
|
| 677 |
+
# Define the output file path for the output
|
| 678 |
+
print("batch_file_path_details just before save:", batch_file_path_details)
|
| 679 |
+
overall_summary_output_path = output_folder + batch_file_path_details + "_overall_summary_" + model_choice_clean_short + "_temp_" + str(temperature) + ".txt"
|
| 680 |
|
| 681 |
# Write the formatted prompt to the specified file
|
| 682 |
try:
|
| 683 |
+
with open(overall_summary_output_path, "w", encoding='utf-8', errors='replace') as f:
|
| 684 |
f.write(summarised_output)
|
| 685 |
+
output_files.append(overall_summary_output_path)
|
| 686 |
except Exception as e:
|
| 687 |
+
print(f"Error writing prompt to file {overall_summary_output_path}: {e}")
|
| 688 |
|
| 689 |
output_files = list(set(output_files))
|
| 690 |
|
tools/helper_functions.py
CHANGED
|
@@ -32,7 +32,6 @@ def empty_output_vars_extract_topics():
|
|
| 32 |
|
| 33 |
return master_topic_df_state, master_topic_summary_df_state, master_reference_df_state, text_output_file, text_output_file_list_state, latest_batch_completed, log_files_output, log_files_output_list_state, conversation_metadata_textbox, estimated_time_taken_number, file_data_state, reference_data_file_name_textbox, display_topic_table_markdown, summary_output_file_list, summary_input_file_list, overall_summarisation_input_files, overall_summary_output_files
|
| 34 |
|
| 35 |
-
|
| 36 |
def empty_output_vars_summarise():
|
| 37 |
# Empty output objects before summarising files
|
| 38 |
|
|
@@ -47,7 +46,7 @@ def empty_output_vars_summarise():
|
|
| 47 |
|
| 48 |
return summary_reference_table_sample_state, master_topic_summary_df_revised_summaries_state, master_reference_df_revised_summaries_state, summary_output_files, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, overall_summarisation_input_files
|
| 49 |
|
| 50 |
-
def get_or_create_env_var(var_name, default_value):
|
| 51 |
# Get the environment variable if it exists
|
| 52 |
value = os.environ.get(var_name)
|
| 53 |
|
|
@@ -58,14 +57,14 @@ def get_or_create_env_var(var_name, default_value):
|
|
| 58 |
|
| 59 |
return value
|
| 60 |
|
| 61 |
-
def get_file_path_with_extension(file_path):
|
| 62 |
# First, get the basename of the file (e.g., "example.txt" from "/path/to/example.txt")
|
| 63 |
basename = os.path.basename(file_path)
|
| 64 |
|
| 65 |
# Return the basename with its extension
|
| 66 |
return basename
|
| 67 |
|
| 68 |
-
def get_file_name_no_ext(file_path):
|
| 69 |
# First, get the basename of the file (e.g., "example.txt" from "/path/to/example.txt")
|
| 70 |
basename = os.path.basename(file_path)
|
| 71 |
|
|
@@ -76,7 +75,7 @@ def get_file_name_no_ext(file_path):
|
|
| 76 |
|
| 77 |
return filename_without_extension
|
| 78 |
|
| 79 |
-
def detect_file_type(filename):
|
| 80 |
"""Detect the file type based on its extension."""
|
| 81 |
if (filename.endswith('.csv')) | (filename.endswith('.csv.gz')) | (filename.endswith('.zip')):
|
| 82 |
return 'csv'
|
|
@@ -232,7 +231,6 @@ def join_cols_onto_reference_df(reference_df:pd.DataFrame, original_data_df:pd.D
|
|
| 232 |
|
| 233 |
return out_reference_df, file_data_outputs
|
| 234 |
|
| 235 |
-
|
| 236 |
def get_basic_response_data(file_data:pd.DataFrame, chosen_cols:List[str], verify_titles:bool=False) -> pd.DataFrame:
|
| 237 |
|
| 238 |
if not isinstance(chosen_cols, list):
|
|
@@ -253,9 +251,7 @@ def get_basic_response_data(file_data:pd.DataFrame, chosen_cols:List[str], verif
|
|
| 253 |
basic_response_data = basic_response_data[['Reference', 'Response', 'Original Reference']]
|
| 254 |
|
| 255 |
basic_response_data["Response"] = basic_response_data["Response"].str.strip()
|
| 256 |
-
basic_response_data["Response"] = basic_response_data["Response"].apply(initial_clean)
|
| 257 |
-
|
| 258 |
-
print("basic_response_data:", basic_response_data)
|
| 259 |
|
| 260 |
return basic_response_data
|
| 261 |
|
|
@@ -291,7 +287,10 @@ def convert_reference_table_to_pivot_table(df:pd.DataFrame, basic_response_data:
|
|
| 291 |
|
| 292 |
def create_topic_summary_df_from_reference_table(reference_df:pd.DataFrame):
|
| 293 |
|
| 294 |
-
|
|
|
|
|
|
|
|
|
|
| 295 |
.agg({
|
| 296 |
'Response References': 'size', # Count the number of references
|
| 297 |
'Summary': lambda x: '<br>'.join(
|
|
@@ -299,12 +298,14 @@ def create_topic_summary_df_from_reference_table(reference_df:pd.DataFrame):
|
|
| 299 |
)
|
| 300 |
})
|
| 301 |
.reset_index()
|
| 302 |
-
|
| 303 |
.assign(Topic_number=lambda df: np.arange(1, len(df) + 1)) # Add numbering 1 to x
|
| 304 |
)
|
| 305 |
|
| 306 |
out_topic_summary_df = out_topic_summary_df.rename(columns={"Response References": "Number of responses"}, errors="ignore")
|
| 307 |
|
|
|
|
|
|
|
| 308 |
return out_topic_summary_df
|
| 309 |
|
| 310 |
# Wrap text in each column to the specified max width, including whole words
|
|
|
|
| 32 |
|
| 33 |
return master_topic_df_state, master_topic_summary_df_state, master_reference_df_state, text_output_file, text_output_file_list_state, latest_batch_completed, log_files_output, log_files_output_list_state, conversation_metadata_textbox, estimated_time_taken_number, file_data_state, reference_data_file_name_textbox, display_topic_table_markdown, summary_output_file_list, summary_input_file_list, overall_summarisation_input_files, overall_summary_output_files
|
| 34 |
|
|
|
|
| 35 |
def empty_output_vars_summarise():
|
| 36 |
# Empty output objects before summarising files
|
| 37 |
|
|
|
|
| 46 |
|
| 47 |
return summary_reference_table_sample_state, master_topic_summary_df_revised_summaries_state, master_reference_df_revised_summaries_state, summary_output_files, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, overall_summarisation_input_files
|
| 48 |
|
| 49 |
+
def get_or_create_env_var(var_name:str, default_value:str):
|
| 50 |
# Get the environment variable if it exists
|
| 51 |
value = os.environ.get(var_name)
|
| 52 |
|
|
|
|
| 57 |
|
| 58 |
return value
|
| 59 |
|
| 60 |
+
def get_file_path_with_extension(file_path:str):
|
| 61 |
# First, get the basename of the file (e.g., "example.txt" from "/path/to/example.txt")
|
| 62 |
basename = os.path.basename(file_path)
|
| 63 |
|
| 64 |
# Return the basename with its extension
|
| 65 |
return basename
|
| 66 |
|
| 67 |
+
def get_file_name_no_ext(file_path:str):
|
| 68 |
# First, get the basename of the file (e.g., "example.txt" from "/path/to/example.txt")
|
| 69 |
basename = os.path.basename(file_path)
|
| 70 |
|
|
|
|
| 75 |
|
| 76 |
return filename_without_extension
|
| 77 |
|
| 78 |
+
def detect_file_type(filename:str):
|
| 79 |
"""Detect the file type based on its extension."""
|
| 80 |
if (filename.endswith('.csv')) | (filename.endswith('.csv.gz')) | (filename.endswith('.zip')):
|
| 81 |
return 'csv'
|
|
|
|
| 231 |
|
| 232 |
return out_reference_df, file_data_outputs
|
| 233 |
|
|
|
|
| 234 |
def get_basic_response_data(file_data:pd.DataFrame, chosen_cols:List[str], verify_titles:bool=False) -> pd.DataFrame:
|
| 235 |
|
| 236 |
if not isinstance(chosen_cols, list):
|
|
|
|
| 251 |
basic_response_data = basic_response_data[['Reference', 'Response', 'Original Reference']]
|
| 252 |
|
| 253 |
basic_response_data["Response"] = basic_response_data["Response"].str.strip()
|
| 254 |
+
basic_response_data["Response"] = basic_response_data["Response"].apply(initial_clean)
|
|
|
|
|
|
|
| 255 |
|
| 256 |
return basic_response_data
|
| 257 |
|
|
|
|
| 287 |
|
| 288 |
def create_topic_summary_df_from_reference_table(reference_df:pd.DataFrame):
|
| 289 |
|
| 290 |
+
if "Group" not in reference_df.columns:
|
| 291 |
+
reference_df["Group"] = "All"
|
| 292 |
+
|
| 293 |
+
out_topic_summary_df = (reference_df.groupby(["General Topic", "Subtopic", "Sentiment", "Group"])
|
| 294 |
.agg({
|
| 295 |
'Response References': 'size', # Count the number of references
|
| 296 |
'Summary': lambda x: '<br>'.join(
|
|
|
|
| 298 |
)
|
| 299 |
})
|
| 300 |
.reset_index()
|
| 301 |
+
#.sort_values('Response References', ascending=False) # Sort by size, biggest first
|
| 302 |
.assign(Topic_number=lambda df: np.arange(1, len(df) + 1)) # Add numbering 1 to x
|
| 303 |
)
|
| 304 |
|
| 305 |
out_topic_summary_df = out_topic_summary_df.rename(columns={"Response References": "Number of responses"}, errors="ignore")
|
| 306 |
|
| 307 |
+
out_topic_summary_df = out_topic_summary_df.sort_values(["Group", "Number of responses", "General Topic", "Subtopic", "Sentiment"], ascending=[True, False, True, True, True])
|
| 308 |
+
|
| 309 |
return out_topic_summary_df
|
| 310 |
|
| 311 |
# Wrap text in each column to the specified max width, including whole words
|
tools/llm_api_call.py
CHANGED
|
@@ -421,7 +421,6 @@ def write_llm_output_and_logs(responses: List[ResponseObject],
|
|
| 421 |
reference_table_out_path = "reference_table_error.csv"
|
| 422 |
topic_summary_df_out_path = "unique_topic_table_error.csv"
|
| 423 |
topic_with_response_df = pd.DataFrame()
|
| 424 |
-
markdown_table = ""
|
| 425 |
out_reference_df = pd.DataFrame()
|
| 426 |
out_topic_summary_df = pd.DataFrame()
|
| 427 |
batch_file_path_details = "error"
|
|
@@ -461,7 +460,7 @@ def write_llm_output_and_logs(responses: List[ResponseObject],
|
|
| 461 |
topic_with_response_df, is_error = convert_response_text_to_dataframe(response_text)
|
| 462 |
except Exception as e:
|
| 463 |
print("Error in parsing markdown table from response text:", e)
|
| 464 |
-
return topic_table_out_path, reference_table_out_path, topic_summary_df_out_path, topic_with_response_df,
|
| 465 |
|
| 466 |
# Rename columns to ensure consistent use of data frames later in code
|
| 467 |
new_column_names = {
|
|
@@ -607,9 +606,11 @@ def write_llm_output_and_logs(responses: List[ResponseObject],
|
|
| 607 |
|
| 608 |
out_topic_summary_df = out_topic_summary_df.rename(columns={"Response References":"Number of responses"}, errors="ignore")
|
| 609 |
|
|
|
|
|
|
|
| 610 |
topic_summary_df_out_path = output_folder + batch_file_path_details + "_unique_topics_" + model_choice_clean + "_temp_" + str(temperature) + ".csv"
|
| 611 |
|
| 612 |
-
return topic_table_out_path, reference_table_out_path, topic_summary_df_out_path, topic_with_response_df,
|
| 613 |
|
| 614 |
def generate_zero_shot_topics_df(zero_shot_topics:pd.DataFrame,
|
| 615 |
force_zero_shot_radio:str="No",
|
|
@@ -988,7 +989,7 @@ def extract_topics(in_data_file,
|
|
| 988 |
full_prompt = formatted_system_prompt + formatted_summary_prompt
|
| 989 |
|
| 990 |
# Define the output file path for the formatted prompt
|
| 991 |
-
formatted_prompt_output_path = output_folder +
|
| 992 |
|
| 993 |
# Write the formatted prompt to the specified file
|
| 994 |
try:
|
|
@@ -1009,7 +1010,7 @@ def extract_topics(in_data_file,
|
|
| 1009 |
responses, conversation_history, whole_conversation, whole_conversation_metadata, response_text = call_llm_with_markdown_table_checks(summary_prompt_list, system_prompt, conversation_history, whole_conversation, whole_conversation_metadata, google_client, google_config, model_choice, temperature, reported_batch_no, local_model, MAX_OUTPUT_VALIDATION_ATTEMPTS, master = True)
|
| 1010 |
|
| 1011 |
# Return output tables
|
| 1012 |
-
topic_table_out_path, reference_table_out_path, topic_summary_df_out_path, new_topic_df,
|
| 1013 |
|
| 1014 |
# Write final output to text file for logging purposes
|
| 1015 |
try:
|
|
@@ -1046,6 +1047,8 @@ def extract_topics(in_data_file,
|
|
| 1046 |
## Unique topic list
|
| 1047 |
new_topic_summary_df = pd.concat([new_topic_summary_df, existing_topic_summary_df]).drop_duplicates('Subtopic')
|
| 1048 |
|
|
|
|
|
|
|
| 1049 |
new_topic_summary_df.to_csv(topic_summary_df_out_path, index=None)
|
| 1050 |
out_file_paths.append(topic_summary_df_out_path)
|
| 1051 |
|
|
@@ -1101,7 +1104,7 @@ def extract_topics(in_data_file,
|
|
| 1101 |
responses, conversation_history, whole_conversation, whole_conversation_metadata, response_text = call_llm_with_markdown_table_checks(batch_prompts, system_prompt, conversation_history, whole_conversation, whole_conversation_metadata, google_client, google_config, model_choice, temperature, reported_batch_no, local_model, MAX_OUTPUT_VALIDATION_ATTEMPTS)
|
| 1102 |
|
| 1103 |
|
| 1104 |
-
topic_table_out_path, reference_table_out_path, topic_summary_df_out_path, topic_table_df,
|
| 1105 |
|
| 1106 |
# If error in table parsing, leave function
|
| 1107 |
if is_error == True:
|
|
@@ -1121,6 +1124,8 @@ def extract_topics(in_data_file,
|
|
| 1121 |
|
| 1122 |
new_topic_summary_df = pd.concat([new_topic_summary_df, existing_topic_summary_df]).drop_duplicates('Subtopic')
|
| 1123 |
|
|
|
|
|
|
|
| 1124 |
new_topic_summary_df.to_csv(topic_summary_df_out_path, index=None)
|
| 1125 |
out_file_paths.append(topic_summary_df_out_path)
|
| 1126 |
|
|
@@ -1131,20 +1136,23 @@ def extract_topics(in_data_file,
|
|
| 1131 |
|
| 1132 |
# Write final output to text file also
|
| 1133 |
try:
|
| 1134 |
-
final_table_output_path = output_folder + batch_file_path_details + "_full_final_response_" + model_choice_clean + "_temp_" + str(temperature) + ".txt"
|
| 1135 |
-
|
| 1136 |
-
if isinstance(responses[-1], ResponseObject):
|
| 1137 |
-
|
| 1138 |
-
|
| 1139 |
-
|
| 1140 |
-
elif "choices" in responses[-1]:
|
| 1141 |
-
|
| 1142 |
-
|
| 1143 |
-
|
| 1144 |
-
else:
|
| 1145 |
-
|
| 1146 |
-
|
| 1147 |
-
|
|
|
|
|
|
|
|
|
|
| 1148 |
|
| 1149 |
log_files_output_paths.append(final_table_output_path)
|
| 1150 |
|
|
@@ -1203,7 +1211,7 @@ def extract_topics(in_data_file,
|
|
| 1203 |
|
| 1204 |
print("All summaries completed. Creating outputs.")
|
| 1205 |
|
| 1206 |
-
|
| 1207 |
# Example usage
|
| 1208 |
in_column_cleaned = clean_column_name(chosen_cols, max_length=20)
|
| 1209 |
|
|
@@ -1214,14 +1222,13 @@ def extract_topics(in_data_file,
|
|
| 1214 |
file_path_details = f"{file_name_cleaned}_col_{in_column_cleaned}"
|
| 1215 |
|
| 1216 |
# Create a pivoted reference table
|
| 1217 |
-
existing_reference_df_pivot = convert_reference_table_to_pivot_table(existing_reference_df)
|
| 1218 |
|
| 1219 |
# Save the new DataFrame to CSV
|
| 1220 |
-
|
| 1221 |
-
|
| 1222 |
-
|
| 1223 |
-
|
| 1224 |
-
basic_response_data_out_path = output_folder + file_path_details + "_simplified_data_file_" + model_choice_clean + "_temp_" + str(temperature) + ".csv"
|
| 1225 |
|
| 1226 |
## Reference table mapping response numbers to topics
|
| 1227 |
existing_reference_df.to_csv(reference_table_out_path, index=None)
|
|
@@ -1230,30 +1237,33 @@ def extract_topics(in_data_file,
|
|
| 1230 |
|
| 1231 |
# Create final unique topics table from reference table to ensure consistent numbers
|
| 1232 |
final_out_topic_summary_df = create_topic_summary_df_from_reference_table(existing_reference_df)
|
|
|
|
| 1233 |
|
| 1234 |
## Unique topic list
|
| 1235 |
final_out_topic_summary_df.to_csv(topic_summary_df_out_path, index=None, encoding='utf-8')
|
| 1236 |
out_file_paths.append(topic_summary_df_out_path)
|
| 1237 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1238 |
# Ensure that we are only returning the final results to outputs
|
| 1239 |
out_file_paths = [x for x in out_file_paths if '_final_' in x]
|
| 1240 |
|
| 1241 |
## Reference table mapping response numbers to topics
|
|
|
|
| 1242 |
existing_reference_df_pivot.to_csv(reference_table_out_pivot_path, index = None, encoding='utf-8')
|
| 1243 |
log_files_output_paths.append(reference_table_out_pivot_path)
|
| 1244 |
|
| 1245 |
## Create a dataframe for missing response references:
|
| 1246 |
# Assuming existing_reference_df and file_data are already defined
|
| 1247 |
-
# Simplify table to just responses column and the Response reference number
|
| 1248 |
-
|
| 1249 |
basic_response_data = get_basic_response_data(file_data, chosen_cols)
|
| 1250 |
|
| 1251 |
-
|
| 1252 |
# Save simplified file data to log outputs
|
| 1253 |
pd.DataFrame(basic_response_data).to_csv(basic_response_data_out_path, index=None, encoding='utf-8')
|
| 1254 |
log_files_output_paths.append(basic_response_data_out_path)
|
| 1255 |
|
| 1256 |
-
|
| 1257 |
# Step 1: Identify missing references
|
| 1258 |
missing_references = basic_response_data[~basic_response_data['Reference'].astype(str).isin(existing_reference_df['Response References'].astype(str).unique())]
|
| 1259 |
|
|
@@ -1267,7 +1277,7 @@ def extract_topics(in_data_file,
|
|
| 1267 |
# Display the new DataFrame
|
| 1268 |
#print("missing_df:", missing_df)
|
| 1269 |
|
| 1270 |
-
missing_df_out_path = output_folder + file_path_details + "_missing_references_" +
|
| 1271 |
missing_df.to_csv(missing_df_out_path, index=None, encoding='utf-8')
|
| 1272 |
log_files_output_paths.append(missing_df_out_path)
|
| 1273 |
|
|
@@ -1281,10 +1291,10 @@ def extract_topics(in_data_file,
|
|
| 1281 |
|
| 1282 |
print("latest_batch_completed at end of batch iterations to return is", latest_batch_completed)
|
| 1283 |
|
| 1284 |
-
return unique_table_df_display_table_markdown, existing_topics_table, final_out_topic_summary_df, existing_reference_df, final_out_file_paths, final_out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, final_out_file_paths, final_out_file_paths, modifiable_topic_summary_df, final_out_file_paths, join_file_paths # gr.Dataframe(value=modifiable_topic_summary_df, headers=None, col_count=(modifiable_topic_summary_df.shape[1], "fixed"), row_count = (modifiable_topic_summary_df.shape[0], "fixed"), visible=True, type="pandas"),
|
| 1285 |
|
| 1286 |
|
| 1287 |
-
return unique_table_df_display_table_markdown, existing_topics_table, existing_topic_summary_df, existing_reference_df, out_file_paths, out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, out_file_paths, out_file_paths, modifiable_topic_summary_df, out_file_paths, join_file_paths # gr.Dataframe(value=modifiable_topic_summary_df, headers=None, col_count=(modifiable_topic_summary_df.shape[1], "fixed"), row_count = (modifiable_topic_summary_df.shape[0], "fixed"), visible=True, type="pandas"),
|
| 1288 |
|
| 1289 |
def wrapper_extract_topics_per_column_value(
|
| 1290 |
selected_col: str,
|
|
@@ -1350,6 +1360,7 @@ def wrapper_extract_topics_per_column_value(
|
|
| 1350 |
acc_topics_table = initial_existing_topics_table.copy()
|
| 1351 |
acc_reference_df = initial_existing_reference_df.copy()
|
| 1352 |
acc_topic_summary_df = initial_existing_topic_summary_df.copy()
|
|
|
|
| 1353 |
|
| 1354 |
# Lists are extended
|
| 1355 |
acc_out_file_paths = []
|
|
@@ -1365,7 +1376,7 @@ def wrapper_extract_topics_per_column_value(
|
|
| 1365 |
|
| 1366 |
wrapper_first_loop = initial_first_loop_state
|
| 1367 |
|
| 1368 |
-
for i, group_value in enumerate(unique_values):
|
| 1369 |
print(f"\nProcessing segment: {selected_col} = {group_value} ({i+1}/{len(unique_values)})")
|
| 1370 |
|
| 1371 |
filtered_file_data = file_data.copy()
|
|
@@ -1412,6 +1423,7 @@ def wrapper_extract_topics_per_column_value(
|
|
| 1412 |
seg_gradio_df,
|
| 1413 |
_seg_out_files5, # Often same as 1
|
| 1414 |
seg_join_files,
|
|
|
|
| 1415 |
) = extract_topics(
|
| 1416 |
in_data_file=in_data_file,
|
| 1417 |
file_data=filtered_file_data,
|
|
@@ -1460,9 +1472,9 @@ def wrapper_extract_topics_per_column_value(
|
|
| 1460 |
# Aggregate results
|
| 1461 |
# The DFs returned by extract_topics are already cumulative for *its own run*.
|
| 1462 |
# We now make them cumulative for the *wrapper's run*.
|
| 1463 |
-
|
| 1464 |
-
|
| 1465 |
-
|
| 1466 |
|
| 1467 |
# For lists, extend. Use set to remove duplicates if paths might be re-added.
|
| 1468 |
acc_out_file_paths.extend(f for f in seg_out_files1 if f not in acc_out_file_paths)
|
|
@@ -1484,6 +1496,32 @@ def wrapper_extract_topics_per_column_value(
|
|
| 1484 |
# Optionally, decide if you want to continue with other segments or stop
|
| 1485 |
# For now, it will continue
|
| 1486 |
continue
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1487 |
|
| 1488 |
print(f"\nWrapper finished processing all segments. Total time: {acc_total_time_taken:.2f}s")
|
| 1489 |
|
|
|
|
| 421 |
reference_table_out_path = "reference_table_error.csv"
|
| 422 |
topic_summary_df_out_path = "unique_topic_table_error.csv"
|
| 423 |
topic_with_response_df = pd.DataFrame()
|
|
|
|
| 424 |
out_reference_df = pd.DataFrame()
|
| 425 |
out_topic_summary_df = pd.DataFrame()
|
| 426 |
batch_file_path_details = "error"
|
|
|
|
| 460 |
topic_with_response_df, is_error = convert_response_text_to_dataframe(response_text)
|
| 461 |
except Exception as e:
|
| 462 |
print("Error in parsing markdown table from response text:", e)
|
| 463 |
+
return topic_table_out_path, reference_table_out_path, topic_summary_df_out_path, topic_with_response_df, out_reference_df, out_topic_summary_df, batch_file_path_details, is_error
|
| 464 |
|
| 465 |
# Rename columns to ensure consistent use of data frames later in code
|
| 466 |
new_column_names = {
|
|
|
|
| 606 |
|
| 607 |
out_topic_summary_df = out_topic_summary_df.rename(columns={"Response References":"Number of responses"}, errors="ignore")
|
| 608 |
|
| 609 |
+
out_topic_summary_df["Group"] = group_name
|
| 610 |
+
|
| 611 |
topic_summary_df_out_path = output_folder + batch_file_path_details + "_unique_topics_" + model_choice_clean + "_temp_" + str(temperature) + ".csv"
|
| 612 |
|
| 613 |
+
return topic_table_out_path, reference_table_out_path, topic_summary_df_out_path, topic_with_response_df, out_reference_df, out_topic_summary_df, batch_file_path_details, is_error
|
| 614 |
|
| 615 |
def generate_zero_shot_topics_df(zero_shot_topics:pd.DataFrame,
|
| 616 |
force_zero_shot_radio:str="No",
|
|
|
|
| 989 |
full_prompt = formatted_system_prompt + formatted_summary_prompt
|
| 990 |
|
| 991 |
# Define the output file path for the formatted prompt
|
| 992 |
+
formatted_prompt_output_path = output_folder + batch_file_path_details + "_full_prompt_" + clean_column_name(model_choice_clean, max_length = 20, front_characters=False) + "_temp_" + str(temperature) + ".txt"
|
| 993 |
|
| 994 |
# Write the formatted prompt to the specified file
|
| 995 |
try:
|
|
|
|
| 1010 |
responses, conversation_history, whole_conversation, whole_conversation_metadata, response_text = call_llm_with_markdown_table_checks(summary_prompt_list, system_prompt, conversation_history, whole_conversation, whole_conversation_metadata, google_client, google_config, model_choice, temperature, reported_batch_no, local_model, MAX_OUTPUT_VALIDATION_ATTEMPTS, master = True)
|
| 1011 |
|
| 1012 |
# Return output tables
|
| 1013 |
+
topic_table_out_path, reference_table_out_path, topic_summary_df_out_path, new_topic_df, new_reference_df, new_topic_summary_df, master_batch_out_file_part, is_error = write_llm_output_and_logs(responses, whole_conversation, whole_conversation_metadata, file_name, latest_batch_completed, start_row, end_row, model_choice_clean, temperature, log_files_output_paths, existing_reference_df, existing_topic_summary_df, batch_size, chosen_cols, batch_basic_response_df, group_name, first_run=False, output_folder=output_folder)
|
| 1014 |
|
| 1015 |
# Write final output to text file for logging purposes
|
| 1016 |
try:
|
|
|
|
| 1047 |
## Unique topic list
|
| 1048 |
new_topic_summary_df = pd.concat([new_topic_summary_df, existing_topic_summary_df]).drop_duplicates('Subtopic')
|
| 1049 |
|
| 1050 |
+
new_topic_summary_df["Group"] = group_name
|
| 1051 |
+
|
| 1052 |
new_topic_summary_df.to_csv(topic_summary_df_out_path, index=None)
|
| 1053 |
out_file_paths.append(topic_summary_df_out_path)
|
| 1054 |
|
|
|
|
| 1104 |
responses, conversation_history, whole_conversation, whole_conversation_metadata, response_text = call_llm_with_markdown_table_checks(batch_prompts, system_prompt, conversation_history, whole_conversation, whole_conversation_metadata, google_client, google_config, model_choice, temperature, reported_batch_no, local_model, MAX_OUTPUT_VALIDATION_ATTEMPTS)
|
| 1105 |
|
| 1106 |
|
| 1107 |
+
topic_table_out_path, reference_table_out_path, topic_summary_df_out_path, topic_table_df, reference_df, new_topic_summary_df, batch_file_path_details, is_error = write_llm_output_and_logs(responses, whole_conversation, whole_conversation_metadata, file_name, latest_batch_completed, start_row, end_row, model_choice_clean, temperature, log_files_output_paths, existing_reference_df, existing_topic_summary_df, batch_size, chosen_cols, batch_basic_response_df, group_name, first_run=True, output_folder=output_folder)
|
| 1108 |
|
| 1109 |
# If error in table parsing, leave function
|
| 1110 |
if is_error == True:
|
|
|
|
| 1124 |
|
| 1125 |
new_topic_summary_df = pd.concat([new_topic_summary_df, existing_topic_summary_df]).drop_duplicates('Subtopic')
|
| 1126 |
|
| 1127 |
+
new_topic_summary_df["Group"] = group_name
|
| 1128 |
+
|
| 1129 |
new_topic_summary_df.to_csv(topic_summary_df_out_path, index=None)
|
| 1130 |
out_file_paths.append(topic_summary_df_out_path)
|
| 1131 |
|
|
|
|
| 1136 |
|
| 1137 |
# Write final output to text file also
|
| 1138 |
try:
|
| 1139 |
+
final_table_output_path = output_folder + batch_file_path_details + "_full_final_response_" + clean_column_name(model_choice_clean, max_length = 20, front_characters=False) + "_temp_" + str(temperature) + ".txt"
|
| 1140 |
+
|
| 1141 |
+
# if isinstance(responses[-1], ResponseObject):
|
| 1142 |
+
# with open(final_table_output_path, "w", encoding='utf-8', errors='replace') as f:
|
| 1143 |
+
# f.write(responses[-1].text)
|
| 1144 |
+
# unique_table_df_display_table_markdown = responses[-1].text
|
| 1145 |
+
# elif "choices" in responses[-1]:
|
| 1146 |
+
# with open(final_table_output_path, "w", encoding='utf-8', errors='replace') as f:
|
| 1147 |
+
# f.write(responses[-1]["choices"][0]['text'])
|
| 1148 |
+
# unique_table_df_display_table_markdown =responses[-1]["choices"][0]['text']
|
| 1149 |
+
# else:
|
| 1150 |
+
# with open(final_table_output_path, "w", encoding='utf-8', errors='replace') as f:
|
| 1151 |
+
# f.write(responses[-1].text)
|
| 1152 |
+
# unique_table_df_display_table_markdown = responses[-1].text
|
| 1153 |
+
|
| 1154 |
+
unique_table_df_display_table = new_topic_summary_df.apply(lambda col: col.map(lambda x: wrap_text(x, max_text_length=500)))
|
| 1155 |
+
unique_table_df_display_table_markdown = unique_table_df_display_table[["General Topic", "Subtopic", "Sentiment", "Number of responses", "Summary"]].to_markdown(index=False)
|
| 1156 |
|
| 1157 |
log_files_output_paths.append(final_table_output_path)
|
| 1158 |
|
|
|
|
| 1211 |
|
| 1212 |
print("All summaries completed. Creating outputs.")
|
| 1213 |
|
| 1214 |
+
model_choice_clean_short = clean_column_name(model_choice_clean, max_length=20, front_characters=False)
|
| 1215 |
# Example usage
|
| 1216 |
in_column_cleaned = clean_column_name(chosen_cols, max_length=20)
|
| 1217 |
|
|
|
|
| 1222 |
file_path_details = f"{file_name_cleaned}_col_{in_column_cleaned}"
|
| 1223 |
|
| 1224 |
# Create a pivoted reference table
|
| 1225 |
+
existing_reference_df_pivot = convert_reference_table_to_pivot_table(existing_reference_df)
|
| 1226 |
|
| 1227 |
# Save the new DataFrame to CSV
|
| 1228 |
+
reference_table_out_pivot_path = output_folder + file_path_details + "_final_reference_table_pivot_" + model_choice_clean_short + "_temp_" + str(temperature) + ".csv"
|
| 1229 |
+
reference_table_out_path = output_folder + file_path_details + "_final_reference_table_" + model_choice_clean_short + "_temp_" + str(temperature) + ".csv"
|
| 1230 |
+
topic_summary_df_out_path = output_folder + file_path_details + "_final_unique_topics_" + model_choice_clean_short + "_temp_" + str(temperature) + ".csv"
|
| 1231 |
+
basic_response_data_out_path = output_folder + file_path_details + "_simplified_data_file_" + model_choice_clean_short + "_temp_" + str(temperature) + ".csv"
|
|
|
|
| 1232 |
|
| 1233 |
## Reference table mapping response numbers to topics
|
| 1234 |
existing_reference_df.to_csv(reference_table_out_path, index=None)
|
|
|
|
| 1237 |
|
| 1238 |
# Create final unique topics table from reference table to ensure consistent numbers
|
| 1239 |
final_out_topic_summary_df = create_topic_summary_df_from_reference_table(existing_reference_df)
|
| 1240 |
+
final_out_topic_summary_df["Group"] = group_name
|
| 1241 |
|
| 1242 |
## Unique topic list
|
| 1243 |
final_out_topic_summary_df.to_csv(topic_summary_df_out_path, index=None, encoding='utf-8')
|
| 1244 |
out_file_paths.append(topic_summary_df_out_path)
|
| 1245 |
|
| 1246 |
+
# Outputs for markdown table output
|
| 1247 |
+
unique_table_df_display_table = final_out_topic_summary_df.apply(lambda col: col.map(lambda x: wrap_text(x, max_text_length=500)))
|
| 1248 |
+
unique_table_df_display_table_markdown = unique_table_df_display_table[["General Topic", "Subtopic", "Sentiment", "Number of responses", "Summary", "Group"]].to_markdown(index=False)
|
| 1249 |
+
|
| 1250 |
# Ensure that we are only returning the final results to outputs
|
| 1251 |
out_file_paths = [x for x in out_file_paths if '_final_' in x]
|
| 1252 |
|
| 1253 |
## Reference table mapping response numbers to topics
|
| 1254 |
+
existing_reference_df_pivot["Group"] = group_name
|
| 1255 |
existing_reference_df_pivot.to_csv(reference_table_out_pivot_path, index = None, encoding='utf-8')
|
| 1256 |
log_files_output_paths.append(reference_table_out_pivot_path)
|
| 1257 |
|
| 1258 |
## Create a dataframe for missing response references:
|
| 1259 |
# Assuming existing_reference_df and file_data are already defined
|
| 1260 |
+
# Simplify table to just responses column and the Response reference number
|
|
|
|
| 1261 |
basic_response_data = get_basic_response_data(file_data, chosen_cols)
|
| 1262 |
|
|
|
|
| 1263 |
# Save simplified file data to log outputs
|
| 1264 |
pd.DataFrame(basic_response_data).to_csv(basic_response_data_out_path, index=None, encoding='utf-8')
|
| 1265 |
log_files_output_paths.append(basic_response_data_out_path)
|
| 1266 |
|
|
|
|
| 1267 |
# Step 1: Identify missing references
|
| 1268 |
missing_references = basic_response_data[~basic_response_data['Reference'].astype(str).isin(existing_reference_df['Response References'].astype(str).unique())]
|
| 1269 |
|
|
|
|
| 1277 |
# Display the new DataFrame
|
| 1278 |
#print("missing_df:", missing_df)
|
| 1279 |
|
| 1280 |
+
missing_df_out_path = output_folder + file_path_details + "_missing_references_" + model_choice_clean_short + "_temp_" + str(temperature) + ".csv"
|
| 1281 |
missing_df.to_csv(missing_df_out_path, index=None, encoding='utf-8')
|
| 1282 |
log_files_output_paths.append(missing_df_out_path)
|
| 1283 |
|
|
|
|
| 1291 |
|
| 1292 |
print("latest_batch_completed at end of batch iterations to return is", latest_batch_completed)
|
| 1293 |
|
| 1294 |
+
return unique_table_df_display_table_markdown, existing_topics_table, final_out_topic_summary_df, existing_reference_df, final_out_file_paths, final_out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, final_out_file_paths, final_out_file_paths, modifiable_topic_summary_df, final_out_file_paths, join_file_paths, existing_reference_df_pivot # gr.Dataframe(value=modifiable_topic_summary_df, headers=None, col_count=(modifiable_topic_summary_df.shape[1], "fixed"), row_count = (modifiable_topic_summary_df.shape[0], "fixed"), visible=True, type="pandas"),
|
| 1295 |
|
| 1296 |
|
| 1297 |
+
return unique_table_df_display_table_markdown, existing_topics_table, existing_topic_summary_df, existing_reference_df, out_file_paths, out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, out_file_paths, out_file_paths, modifiable_topic_summary_df, out_file_paths, join_file_paths, existing_reference_df_pivot # gr.Dataframe(value=modifiable_topic_summary_df, headers=None, col_count=(modifiable_topic_summary_df.shape[1], "fixed"), row_count = (modifiable_topic_summary_df.shape[0], "fixed"), visible=True, type="pandas"),
|
| 1298 |
|
| 1299 |
def wrapper_extract_topics_per_column_value(
|
| 1300 |
selected_col: str,
|
|
|
|
| 1360 |
acc_topics_table = initial_existing_topics_table.copy()
|
| 1361 |
acc_reference_df = initial_existing_reference_df.copy()
|
| 1362 |
acc_topic_summary_df = initial_existing_topic_summary_df.copy()
|
| 1363 |
+
acc_reference_df_pivot = pd.DataFrame()
|
| 1364 |
|
| 1365 |
# Lists are extended
|
| 1366 |
acc_out_file_paths = []
|
|
|
|
| 1376 |
|
| 1377 |
wrapper_first_loop = initial_first_loop_state
|
| 1378 |
|
| 1379 |
+
for i, group_value in tqdm(enumerate(unique_values), desc=f"Analysing by group", total=len(unique_values), unit="groups"):
|
| 1380 |
print(f"\nProcessing segment: {selected_col} = {group_value} ({i+1}/{len(unique_values)})")
|
| 1381 |
|
| 1382 |
filtered_file_data = file_data.copy()
|
|
|
|
| 1423 |
seg_gradio_df,
|
| 1424 |
_seg_out_files5, # Often same as 1
|
| 1425 |
seg_join_files,
|
| 1426 |
+
seg_reference_df_pivot
|
| 1427 |
) = extract_topics(
|
| 1428 |
in_data_file=in_data_file,
|
| 1429 |
file_data=filtered_file_data,
|
|
|
|
| 1472 |
# Aggregate results
|
| 1473 |
# The DFs returned by extract_topics are already cumulative for *its own run*.
|
| 1474 |
# We now make them cumulative for the *wrapper's run*.
|
| 1475 |
+
acc_reference_df = pd.concat([acc_reference_df, seg_reference_df])
|
| 1476 |
+
acc_topic_summary_df = pd.concat([acc_topic_summary_df, seg_topic_summary_df])
|
| 1477 |
+
acc_reference_df_pivot = pd.concat([acc_reference_df_pivot, seg_reference_df_pivot])
|
| 1478 |
|
| 1479 |
# For lists, extend. Use set to remove duplicates if paths might be re-added.
|
| 1480 |
acc_out_file_paths.extend(f for f in seg_out_files1 if f not in acc_out_file_paths)
|
|
|
|
| 1496 |
# Optionally, decide if you want to continue with other segments or stop
|
| 1497 |
# For now, it will continue
|
| 1498 |
continue
|
| 1499 |
+
|
| 1500 |
+
if "Group" in acc_reference_df.columns:
|
| 1501 |
+
model_choice_clean = model_name_map[model_choice]
|
| 1502 |
+
model_choice_clean_short = clean_column_name(model_choice_clean, max_length=20, front_characters=False)
|
| 1503 |
+
overall_file_name = f"{clean_column_name(original_file_name, max_length=30)}_"
|
| 1504 |
+
|
| 1505 |
+
acc_reference_df_path = output_folder + overall_file_name + "all_reference_table_" + model_choice_clean_short + "_temp_" + str(temperature) + ".csv"
|
| 1506 |
+
acc_topic_summary_df_path = output_folder + overall_file_name + "all_unique_topics_" + model_choice_clean_short + "_temp_" + str(temperature) + ".csv"
|
| 1507 |
+
acc_reference_df_pivot_path = output_folder + overall_file_name + "all_reference_pivot_" + model_choice_clean_short + "_temp_" + str(temperature) + ".csv"
|
| 1508 |
+
|
| 1509 |
+
acc_reference_df.to_csv(acc_reference_df_path, index=None)
|
| 1510 |
+
acc_topic_summary_df.to_csv(acc_topic_summary_df_path, index=None)
|
| 1511 |
+
acc_reference_df_pivot.to_csv(acc_reference_df_pivot_path, index=None)
|
| 1512 |
+
|
| 1513 |
+
# Remove the existing output file list and replace with the updated concatenated outputs
|
| 1514 |
+
substring_list_to_remove = ["_final_reference_table_pivot_", "_final_reference_table_", "_final_unique_topics_"]
|
| 1515 |
+
acc_out_file_paths = [
|
| 1516 |
+
x for x in acc_out_file_paths
|
| 1517 |
+
if not any(sub in x for sub in substring_list_to_remove)
|
| 1518 |
+
]
|
| 1519 |
+
|
| 1520 |
+
acc_out_file_paths.extend([acc_reference_df_path, acc_topic_summary_df_path])
|
| 1521 |
+
|
| 1522 |
+
# Outputs for markdown table output
|
| 1523 |
+
unique_table_df_display_table = acc_topic_summary_df.apply(lambda col: col.map(lambda x: wrap_text(x, max_text_length=500)))
|
| 1524 |
+
acc_markdown_output = unique_table_df_display_table[["General Topic", "Subtopic", "Sentiment", "Number of responses", "Summary", "Group"]].to_markdown(index=False)
|
| 1525 |
|
| 1526 |
print(f"\nWrapper finished processing all segments. Total time: {acc_total_time_taken:.2f}s")
|
| 1527 |
|
tools/prompts.py
CHANGED
|
@@ -85,7 +85,7 @@ Your task is to summarise the above table in markdown format. {summary_format}.
|
|
| 85 |
|
| 86 |
Summary:"""
|
| 87 |
|
| 88 |
-
comprehensive_summary_format_prompt = "Return a comprehensive summary that covers all the important topics and themes described in the table"
|
| 89 |
|
| 90 |
|
| 91 |
### Verify existing categories prompt
|
|
|
|
| 85 |
|
| 86 |
Summary:"""
|
| 87 |
|
| 88 |
+
comprehensive_summary_format_prompt = "Return a comprehensive summary that covers all the important topics and themes described in the table. If there are different values in the Group column of the data, compare and contrast differences between the topics and themes from each Group."
|
| 89 |
|
| 90 |
|
| 91 |
### Verify existing categories prompt
|