Spaces:
Running
on
Zero
Running
on
Zero
Commit
·
92003de
1
Parent(s):
f2d85f1
Further improved grouping implementation, improved summarisation prompts
Browse files- app.py +5 -5
- tools/aws_functions.py +2 -2
- tools/config.py +2 -2
- tools/dedup_summaries.py +77 -26
- tools/llm_api_call.py +1 -1
- tools/prompts.py +30 -17
app.py
CHANGED
|
@@ -347,13 +347,13 @@ with app:
|
|
| 347 |
summarise_previous_data_btn.click(empty_output_vars_summarise, inputs=None, outputs=[summary_reference_table_sample_state, master_unique_topics_df_revised_summaries_state, master_reference_df_revised_summaries_state, summary_output_files, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, overall_summarisation_input_files]).\
|
| 348 |
success(load_in_previous_data_files, inputs=[summarisation_input_files], outputs=[master_reference_df_state, master_unique_topics_df_state, latest_batch_completed_no_loop, deduplication_input_files_status, reference_data_file_name_textbox, unique_topics_table_file_name_textbox]).\
|
| 349 |
success(sample_reference_table_summaries, inputs=[master_reference_df_state, random_seed], outputs=[summary_reference_table_sample_state, summarised_references_markdown], api_name="sample_summaries").\
|
| 350 |
-
success(summarise_output_topics, inputs=[summary_reference_table_sample_state, master_unique_topics_df_state, master_reference_df_state, model_choice, in_api_key, temperature_slide, reference_data_file_name_textbox, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, in_data_files, in_excel_sheets, in_colnames, log_files_output_list_state, summarise_format_radio, output_folder_state], outputs=[summary_reference_table_sample_state, master_unique_topics_df_revised_summaries_state, master_reference_df_revised_summaries_state, summary_output_files, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, summarised_output_markdown, log_files_output, overall_summarisation_input_files], api_name="summarise_topics")
|
| 351 |
|
| 352 |
-
latest_summary_completed_num.change(summarise_output_topics, inputs=[summary_reference_table_sample_state, master_unique_topics_df_state, master_reference_df_state, model_choice, in_api_key, temperature_slide, reference_data_file_name_textbox, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, in_data_files, in_excel_sheets, in_colnames, log_files_output_list_state, summarise_format_radio, output_folder_state], outputs=[summary_reference_table_sample_state, master_unique_topics_df_revised_summaries_state, master_reference_df_revised_summaries_state, summary_output_files, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, summarised_output_markdown, log_files_output, overall_summarisation_input_files], scroll_to_output=True)
|
| 353 |
|
| 354 |
# SUMMARISE WHOLE TABLE PAGE
|
| 355 |
overall_summarise_previous_data_btn.click(load_in_previous_data_files, inputs=[overall_summarisation_input_files], outputs=[master_reference_df_state, master_unique_topics_df_state, latest_batch_completed_no_loop, deduplication_input_files_status, reference_data_file_name_textbox, unique_topics_table_file_name_textbox]).\
|
| 356 |
-
success(overall_summary, inputs=[master_unique_topics_df_state, model_choice, in_api_key, temperature_slide, unique_topics_table_file_name_textbox,
|
| 357 |
|
| 358 |
###
|
| 359 |
# CONTINUE PREVIOUS TOPIC EXTRACTION PAGE
|
|
@@ -361,8 +361,8 @@ with app:
|
|
| 361 |
|
| 362 |
# If uploaded partially completed consultation files do this. This should then start up the 'latest_batch_completed' change action above to continue extracting topics.
|
| 363 |
continue_previous_data_files_btn.click(
|
| 364 |
-
|
| 365 |
-
|
| 366 |
|
| 367 |
###
|
| 368 |
# VERIFY TEXT TITLES/DESCRIPTIONS
|
|
|
|
| 347 |
summarise_previous_data_btn.click(empty_output_vars_summarise, inputs=None, outputs=[summary_reference_table_sample_state, master_unique_topics_df_revised_summaries_state, master_reference_df_revised_summaries_state, summary_output_files, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, overall_summarisation_input_files]).\
|
| 348 |
success(load_in_previous_data_files, inputs=[summarisation_input_files], outputs=[master_reference_df_state, master_unique_topics_df_state, latest_batch_completed_no_loop, deduplication_input_files_status, reference_data_file_name_textbox, unique_topics_table_file_name_textbox]).\
|
| 349 |
success(sample_reference_table_summaries, inputs=[master_reference_df_state, random_seed], outputs=[summary_reference_table_sample_state, summarised_references_markdown], api_name="sample_summaries").\
|
| 350 |
+
success(summarise_output_topics, inputs=[summary_reference_table_sample_state, master_unique_topics_df_state, master_reference_df_state, model_choice, in_api_key, temperature_slide, reference_data_file_name_textbox, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, in_data_files, in_excel_sheets, in_colnames, log_files_output_list_state, summarise_format_radio, output_folder_state, context_textbox], outputs=[summary_reference_table_sample_state, master_unique_topics_df_revised_summaries_state, master_reference_df_revised_summaries_state, summary_output_files, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, summarised_output_markdown, log_files_output, overall_summarisation_input_files], api_name="summarise_topics")
|
| 351 |
|
| 352 |
+
latest_summary_completed_num.change(summarise_output_topics, inputs=[summary_reference_table_sample_state, master_unique_topics_df_state, master_reference_df_state, model_choice, in_api_key, temperature_slide, reference_data_file_name_textbox, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, in_data_files, in_excel_sheets, in_colnames, log_files_output_list_state, summarise_format_radio, output_folder_state, context_textbox], outputs=[summary_reference_table_sample_state, master_unique_topics_df_revised_summaries_state, master_reference_df_revised_summaries_state, summary_output_files, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, summarised_output_markdown, log_files_output, overall_summarisation_input_files], scroll_to_output=True)
|
| 353 |
|
| 354 |
# SUMMARISE WHOLE TABLE PAGE
|
| 355 |
overall_summarise_previous_data_btn.click(load_in_previous_data_files, inputs=[overall_summarisation_input_files], outputs=[master_reference_df_state, master_unique_topics_df_state, latest_batch_completed_no_loop, deduplication_input_files_status, reference_data_file_name_textbox, unique_topics_table_file_name_textbox]).\
|
| 356 |
+
success(overall_summary, inputs=[master_unique_topics_df_state, model_choice, in_api_key, temperature_slide, unique_topics_table_file_name_textbox, output_folder_state, in_colnames, context_textbox], outputs=[overall_summary_output_files, overall_summarised_output_markdown], scroll_to_output=True, api_name="overall_summary")
|
| 357 |
|
| 358 |
###
|
| 359 |
# CONTINUE PREVIOUS TOPIC EXTRACTION PAGE
|
|
|
|
| 361 |
|
| 362 |
# If uploaded partially completed consultation files do this. This should then start up the 'latest_batch_completed' change action above to continue extracting topics.
|
| 363 |
continue_previous_data_files_btn.click(
|
| 364 |
+
load_in_data_file, inputs = [in_data_files, in_colnames, batch_size_number, in_excel_sheets], outputs = [file_data_state, reference_data_file_name_textbox, total_number_of_batches]).\
|
| 365 |
+
success(load_in_previous_data_files, inputs=[in_previous_data_files], outputs=[master_reference_df_state, master_unique_topics_df_state, latest_batch_completed, in_previous_data_files_status, reference_data_file_name_textbox])
|
| 366 |
|
| 367 |
###
|
| 368 |
# VERIFY TEXT TITLES/DESCRIPTIONS
|
tools/aws_functions.py
CHANGED
|
@@ -3,7 +3,7 @@ import pandas as pd
|
|
| 3 |
import boto3
|
| 4 |
import tempfile
|
| 5 |
import os
|
| 6 |
-
from tools.config import RUN_AWS_FUNCTIONS, AWS_REGION,
|
| 7 |
|
| 8 |
PandasDataFrame = Type[pd.DataFrame]
|
| 9 |
|
|
@@ -12,7 +12,7 @@ bucket_name=""
|
|
| 12 |
|
| 13 |
if RUN_AWS_FUNCTIONS == "1":
|
| 14 |
try:
|
| 15 |
-
bucket_name =
|
| 16 |
session = boto3.Session() # profile_name="default"
|
| 17 |
except Exception as e:
|
| 18 |
print(e)
|
|
|
|
| 3 |
import boto3
|
| 4 |
import tempfile
|
| 5 |
import os
|
| 6 |
+
from tools.config import RUN_AWS_FUNCTIONS, AWS_REGION, S3_LOG_BUCKET
|
| 7 |
|
| 8 |
PandasDataFrame = Type[pd.DataFrame]
|
| 9 |
|
|
|
|
| 12 |
|
| 13 |
if RUN_AWS_FUNCTIONS == "1":
|
| 14 |
try:
|
| 15 |
+
bucket_name = S3_LOG_BUCKET
|
| 16 |
session = boto3.Session() # profile_name="default"
|
| 17 |
except Exception as e:
|
| 18 |
print(e)
|
tools/config.py
CHANGED
|
@@ -105,7 +105,7 @@ if AWS_ACCESS_KEY: print(f'AWS_ACCESS_KEY found in environment variables')
|
|
| 105 |
AWS_SECRET_KEY = get_or_create_env_var('AWS_SECRET_KEY', '')
|
| 106 |
if AWS_SECRET_KEY: print(f'AWS_SECRET_KEY found in environment variables')
|
| 107 |
|
| 108 |
-
|
| 109 |
|
| 110 |
# Custom headers e.g. if routing traffic through Cloudfront
|
| 111 |
# Retrieving or setting CUSTOM_HEADER
|
|
@@ -320,7 +320,7 @@ GET_DEFAULT_ALLOW_LIST = get_or_create_env_var('GET_DEFAULT_ALLOW_LIST', '')
|
|
| 320 |
|
| 321 |
ALLOW_LIST_PATH = get_or_create_env_var('ALLOW_LIST_PATH', '') # config/default_allow_list.csv
|
| 322 |
|
| 323 |
-
S3_ALLOW_LIST_PATH = get_or_create_env_var('S3_ALLOW_LIST_PATH', '') # default_allow_list.csv # This is a path within the
|
| 324 |
|
| 325 |
if ALLOW_LIST_PATH: OUTPUT_ALLOW_LIST_PATH = ALLOW_LIST_PATH
|
| 326 |
else: OUTPUT_ALLOW_LIST_PATH = 'config/default_allow_list.csv'
|
|
|
|
| 105 |
AWS_SECRET_KEY = get_or_create_env_var('AWS_SECRET_KEY', '')
|
| 106 |
if AWS_SECRET_KEY: print(f'AWS_SECRET_KEY found in environment variables')
|
| 107 |
|
| 108 |
+
S3_LOG_BUCKET = get_or_create_env_var('S3_LOG_BUCKET', '')
|
| 109 |
|
| 110 |
# Custom headers e.g. if routing traffic through Cloudfront
|
| 111 |
# Retrieving or setting CUSTOM_HEADER
|
|
|
|
| 320 |
|
| 321 |
ALLOW_LIST_PATH = get_or_create_env_var('ALLOW_LIST_PATH', '') # config/default_allow_list.csv
|
| 322 |
|
| 323 |
+
S3_ALLOW_LIST_PATH = get_or_create_env_var('S3_ALLOW_LIST_PATH', '') # default_allow_list.csv # This is a path within the named S3 bucket
|
| 324 |
|
| 325 |
if ALLOW_LIST_PATH: OUTPUT_ALLOW_LIST_PATH = ALLOW_LIST_PATH
|
| 326 |
else: OUTPUT_ALLOW_LIST_PATH = 'config/default_allow_list.csv'
|
tools/dedup_summaries.py
CHANGED
|
@@ -7,7 +7,7 @@ import gradio as gr
|
|
| 7 |
import time
|
| 8 |
from tqdm import tqdm
|
| 9 |
|
| 10 |
-
from tools.prompts import summarise_topic_descriptions_prompt, summarise_topic_descriptions_system_prompt, system_prompt, summarise_everything_prompt, comprehensive_summary_format_prompt
|
| 11 |
from tools.llm_funcs import construct_gemini_generative_model, process_requests, ResponseObject, load_model
|
| 12 |
from tools.helper_functions import create_topic_summary_df_from_reference_table, load_in_data_file, get_basic_response_data, convert_reference_table_to_pivot_table, wrap_text, clean_column_name
|
| 13 |
from tools.config import OUTPUT_FOLDER, RUN_LOCAL_MODEL, MAX_COMMENT_CHARS, MAX_TOKENS, TIMEOUT_WAIT, NUMBER_OF_RETRY_ATTEMPTS, MAX_TIME_FOR_LOOP, BATCH_SIZE_DEFAULT, DEDUPLICATION_THRESHOLD, model_name_map, CHOSEN_LOCAL_MODEL_TYPE, LOCAL_REPO_ID, LOCAL_MODEL_FILE, LOCAL_MODEL_FOLDER, LLM_SEED
|
|
@@ -437,7 +437,8 @@ def summarise_output_topics(summarised_references:pd.DataFrame,
|
|
| 437 |
chosen_cols:List[str]=[],
|
| 438 |
log_output_files:list[str]=[],
|
| 439 |
summarise_format_radio:str="Return a summary up to two paragraphs long that includes as much detail as possible from the original text",
|
| 440 |
-
output_folder:str=OUTPUT_FOLDER,
|
|
|
|
| 441 |
summarise_topic_descriptions_prompt:str=summarise_topic_descriptions_prompt, summarise_topic_descriptions_system_prompt:str=summarise_topic_descriptions_system_prompt,
|
| 442 |
do_summaries:str="Yes",
|
| 443 |
progress=gr.Progress(track_tqdm=True)):
|
|
@@ -566,8 +567,10 @@ def summarise_output_topics(summarised_references:pd.DataFrame,
|
|
| 566 |
#print("summary_text:", summary_text)
|
| 567 |
formatted_summary_prompt = [summarise_topic_descriptions_prompt.format(summaries=summary_text, summary_format=summarise_format_radio)]
|
| 568 |
|
|
|
|
|
|
|
| 569 |
try:
|
| 570 |
-
response, conversation_history, metadata = summarise_output_topics_query(model_choice, in_api_key, temperature, formatted_summary_prompt,
|
| 571 |
summarised_output = response
|
| 572 |
summarised_output = re.sub(r'\n{2,}', '\n', summarised_output) # Replace multiple line breaks with a single line break
|
| 573 |
summarised_output = re.sub(r'^\n{1,}', '', summarised_output) # Remove one or more line breaks at the start
|
|
@@ -584,7 +587,7 @@ def summarise_output_topics(summarised_references:pd.DataFrame,
|
|
| 584 |
|
| 585 |
# Check if beyond max time allowed for processing and break if necessary
|
| 586 |
toc = time.perf_counter()
|
| 587 |
-
time_taken =
|
| 588 |
|
| 589 |
if time_taken > max_time_for_loop:
|
| 590 |
print("Time taken for loop is greater than maximum time allowed. Exiting and restarting loop")
|
|
@@ -594,7 +597,7 @@ def summarise_output_topics(summarised_references:pd.DataFrame,
|
|
| 594 |
|
| 595 |
# If all summaries completed
|
| 596 |
if latest_summary_completed >= length_all_summaries:
|
| 597 |
-
print("At last summary.")
|
| 598 |
|
| 599 |
output_files = list(set(output_files))
|
| 600 |
|
|
@@ -605,10 +608,13 @@ def overall_summary(topic_summary_df:pd.DataFrame,
|
|
| 605 |
in_api_key:str,
|
| 606 |
temperature:float,
|
| 607 |
table_file_name:str,
|
| 608 |
-
|
| 609 |
-
|
|
|
|
| 610 |
summarise_everything_prompt:str=summarise_everything_prompt,
|
| 611 |
comprehensive_summary_format_prompt:str=comprehensive_summary_format_prompt,
|
|
|
|
|
|
|
| 612 |
do_summaries:str="Yes",
|
| 613 |
progress=gr.Progress(track_tqdm=True)):
|
| 614 |
'''
|
|
@@ -616,17 +622,35 @@ def overall_summary(topic_summary_df:pd.DataFrame,
|
|
| 616 |
'''
|
| 617 |
|
| 618 |
out_metadata = []
|
| 619 |
-
local_model = []
|
| 620 |
-
length_all_summaries = 1
|
| 621 |
latest_summary_completed = 0
|
| 622 |
output_files = []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 623 |
|
| 624 |
model_choice_clean = model_name_map[model_choice]
|
| 625 |
model_choice_clean_short = clean_column_name(model_choice_clean, max_length=20, front_characters=False)
|
| 626 |
file_name = re.search(r'(.*?)(?:_all_|_final_|_batch_|_col_)', table_file_name).group(1) if re.search(r'(.*?)(?:_all_|_final_|_batch_|_col_)', table_file_name) else table_file_name
|
| 627 |
latest_batch_completed = int(re.search(r'batch_(\d+)_', table_file_name).group(1)) if 'batch_' in table_file_name else ""
|
| 628 |
batch_size_number = int(re.search(r'size_(\d+)_', table_file_name).group(1)) if 'size_' in table_file_name else ""
|
| 629 |
-
in_column_cleaned = re.search(r'col_(.*?)
|
| 630 |
|
| 631 |
# Save outputs for each batch. If master file created, label file as master
|
| 632 |
if latest_batch_completed:
|
|
@@ -644,48 +668,75 @@ def overall_summary(topic_summary_df:pd.DataFrame,
|
|
| 644 |
local_model, tokenizer = load_model(local_model_type=CHOSEN_LOCAL_MODEL_TYPE, repo_id=LOCAL_REPO_ID, model_filename=LOCAL_MODEL_FILE, model_dir=LOCAL_MODEL_FOLDER)
|
| 645 |
#print("Local model loaded:", local_model)
|
| 646 |
|
| 647 |
-
|
| 648 |
-
summary_loop = tqdm(range(latest_summary_completed, length_all_summaries), desc="Creating summaries", unit="summaries")
|
| 649 |
|
| 650 |
if do_summaries == "Yes":
|
| 651 |
-
for
|
| 652 |
|
| 653 |
-
|
|
|
|
|
|
|
| 654 |
|
| 655 |
formatted_summary_prompt = [summarise_everything_prompt.format(topic_summary_table=summary_text, summary_format=comprehensive_summary_format_prompt)]
|
| 656 |
|
|
|
|
|
|
|
| 657 |
try:
|
| 658 |
-
response, conversation_history, metadata = summarise_output_topics_query(model_choice, in_api_key, temperature, formatted_summary_prompt,
|
| 659 |
summarised_output = response
|
| 660 |
summarised_output = re.sub(r'\n{2,}', '\n', summarised_output) # Replace multiple line breaks with a single line break
|
| 661 |
summarised_output = re.sub(r'^\n{1,}', '', summarised_output) # Remove one or more line breaks at the start
|
| 662 |
summarised_output = summarised_output.strip()
|
| 663 |
except Exception as e:
|
| 664 |
-
print(e)
|
| 665 |
summarised_output = ""
|
| 666 |
|
| 667 |
summarised_outputs.append(summarised_output)
|
|
|
|
|
|
|
| 668 |
out_metadata.extend(metadata)
|
| 669 |
out_metadata_str = '. '.join(out_metadata)
|
| 670 |
|
| 671 |
latest_summary_completed += 1
|
| 672 |
|
| 673 |
-
|
| 674 |
-
toc = time.perf_counter()
|
| 675 |
-
time_taken = tic - toc
|
| 676 |
|
| 677 |
-
#
|
| 678 |
-
|
| 679 |
-
overall_summary_output_path = output_folder + batch_file_path_details + "_overall_summary_" + model_choice_clean_short + "_temp_" + str(temperature) + ".txt"
|
| 680 |
|
| 681 |
-
# Write
|
| 682 |
try:
|
| 683 |
with open(overall_summary_output_path, "w", encoding='utf-8', errors='replace') as f:
|
| 684 |
f.write(summarised_output)
|
| 685 |
-
output_files.append(overall_summary_output_path)
|
| 686 |
except Exception as e:
|
| 687 |
print(f"Error writing prompt to file {overall_summary_output_path}: {e}")
|
| 688 |
|
| 689 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 690 |
|
| 691 |
-
return output_files,
|
|
|
|
| 7 |
import time
|
| 8 |
from tqdm import tqdm
|
| 9 |
|
| 10 |
+
from tools.prompts import summarise_topic_descriptions_prompt, summarise_topic_descriptions_system_prompt, system_prompt, summarise_everything_prompt, comprehensive_summary_format_prompt, summarise_everything_system_prompt, comprehensive_summary_format_prompt_by_group
|
| 11 |
from tools.llm_funcs import construct_gemini_generative_model, process_requests, ResponseObject, load_model
|
| 12 |
from tools.helper_functions import create_topic_summary_df_from_reference_table, load_in_data_file, get_basic_response_data, convert_reference_table_to_pivot_table, wrap_text, clean_column_name
|
| 13 |
from tools.config import OUTPUT_FOLDER, RUN_LOCAL_MODEL, MAX_COMMENT_CHARS, MAX_TOKENS, TIMEOUT_WAIT, NUMBER_OF_RETRY_ATTEMPTS, MAX_TIME_FOR_LOOP, BATCH_SIZE_DEFAULT, DEDUPLICATION_THRESHOLD, model_name_map, CHOSEN_LOCAL_MODEL_TYPE, LOCAL_REPO_ID, LOCAL_MODEL_FILE, LOCAL_MODEL_FOLDER, LLM_SEED
|
|
|
|
| 437 |
chosen_cols:List[str]=[],
|
| 438 |
log_output_files:list[str]=[],
|
| 439 |
summarise_format_radio:str="Return a summary up to two paragraphs long that includes as much detail as possible from the original text",
|
| 440 |
+
output_folder:str=OUTPUT_FOLDER,
|
| 441 |
+
context_textbox:str="",
|
| 442 |
summarise_topic_descriptions_prompt:str=summarise_topic_descriptions_prompt, summarise_topic_descriptions_system_prompt:str=summarise_topic_descriptions_system_prompt,
|
| 443 |
do_summaries:str="Yes",
|
| 444 |
progress=gr.Progress(track_tqdm=True)):
|
|
|
|
| 567 |
#print("summary_text:", summary_text)
|
| 568 |
formatted_summary_prompt = [summarise_topic_descriptions_prompt.format(summaries=summary_text, summary_format=summarise_format_radio)]
|
| 569 |
|
| 570 |
+
formatted_summarise_topic_descriptions_system_prompt = summarise_topic_descriptions_system_prompt.format(column_name=chosen_cols[0],consultation_context=context_textbox)
|
| 571 |
+
|
| 572 |
try:
|
| 573 |
+
response, conversation_history, metadata = summarise_output_topics_query(model_choice, in_api_key, temperature, formatted_summary_prompt, formatted_summarise_topic_descriptions_system_prompt, local_model)
|
| 574 |
summarised_output = response
|
| 575 |
summarised_output = re.sub(r'\n{2,}', '\n', summarised_output) # Replace multiple line breaks with a single line break
|
| 576 |
summarised_output = re.sub(r'^\n{1,}', '', summarised_output) # Remove one or more line breaks at the start
|
|
|
|
| 587 |
|
| 588 |
# Check if beyond max time allowed for processing and break if necessary
|
| 589 |
toc = time.perf_counter()
|
| 590 |
+
time_taken = toc - tic
|
| 591 |
|
| 592 |
if time_taken > max_time_for_loop:
|
| 593 |
print("Time taken for loop is greater than maximum time allowed. Exiting and restarting loop")
|
|
|
|
| 597 |
|
| 598 |
# If all summaries completed
|
| 599 |
if latest_summary_completed >= length_all_summaries:
|
| 600 |
+
print("At last summary. Time taken:", time_taken)
|
| 601 |
|
| 602 |
output_files = list(set(output_files))
|
| 603 |
|
|
|
|
| 608 |
in_api_key:str,
|
| 609 |
temperature:float,
|
| 610 |
table_file_name:str,
|
| 611 |
+
output_folder:str=OUTPUT_FOLDER,
|
| 612 |
+
chosen_cols:List[str]=[],
|
| 613 |
+
context_textbox:str="",
|
| 614 |
summarise_everything_prompt:str=summarise_everything_prompt,
|
| 615 |
comprehensive_summary_format_prompt:str=comprehensive_summary_format_prompt,
|
| 616 |
+
comprehensive_summary_format_prompt_by_group:str=comprehensive_summary_format_prompt_by_group,
|
| 617 |
+
summarise_everything_system_prompt:str=summarise_everything_system_prompt,
|
| 618 |
do_summaries:str="Yes",
|
| 619 |
progress=gr.Progress(track_tqdm=True)):
|
| 620 |
'''
|
|
|
|
| 622 |
'''
|
| 623 |
|
| 624 |
out_metadata = []
|
| 625 |
+
local_model = []
|
|
|
|
| 626 |
latest_summary_completed = 0
|
| 627 |
output_files = []
|
| 628 |
+
txt_summarised_outputs = []
|
| 629 |
+
summarised_outputs = []
|
| 630 |
+
|
| 631 |
+
if "Group" not in topic_summary_df.columns:
|
| 632 |
+
topic_summary_df["Group"] = "All"
|
| 633 |
+
|
| 634 |
+
|
| 635 |
+
topic_summary_df = topic_summary_df.sort_values(by=["Group", "Number of responses"], ascending=[True, False])
|
| 636 |
+
|
| 637 |
+
unique_groups = sorted(topic_summary_df["Group"].unique())
|
| 638 |
+
|
| 639 |
+
print("unique_groups:", unique_groups)
|
| 640 |
+
|
| 641 |
+
length_groups = len(unique_groups)
|
| 642 |
+
|
| 643 |
+
if length_groups > 1:
|
| 644 |
+
comprehensive_summary_format_prompt = comprehensive_summary_format_prompt_by_group
|
| 645 |
+
else:
|
| 646 |
+
comprehensive_summary_format_prompt = comprehensive_summary_format_prompt
|
| 647 |
|
| 648 |
model_choice_clean = model_name_map[model_choice]
|
| 649 |
model_choice_clean_short = clean_column_name(model_choice_clean, max_length=20, front_characters=False)
|
| 650 |
file_name = re.search(r'(.*?)(?:_all_|_final_|_batch_|_col_)', table_file_name).group(1) if re.search(r'(.*?)(?:_all_|_final_|_batch_|_col_)', table_file_name) else table_file_name
|
| 651 |
latest_batch_completed = int(re.search(r'batch_(\d+)_', table_file_name).group(1)) if 'batch_' in table_file_name else ""
|
| 652 |
batch_size_number = int(re.search(r'size_(\d+)_', table_file_name).group(1)) if 'size_' in table_file_name else ""
|
| 653 |
+
in_column_cleaned = re.search(r'col_(.*?)_unique', table_file_name).group(1) if 'col_' in table_file_name else ""
|
| 654 |
|
| 655 |
# Save outputs for each batch. If master file created, label file as master
|
| 656 |
if latest_batch_completed:
|
|
|
|
| 668 |
local_model, tokenizer = load_model(local_model_type=CHOSEN_LOCAL_MODEL_TYPE, repo_id=LOCAL_REPO_ID, model_filename=LOCAL_MODEL_FILE, model_dir=LOCAL_MODEL_FOLDER)
|
| 669 |
#print("Local model loaded:", local_model)
|
| 670 |
|
| 671 |
+
summary_loop = tqdm(unique_groups, desc="Creating summaries for groups", unit="groups")
|
|
|
|
| 672 |
|
| 673 |
if do_summaries == "Yes":
|
| 674 |
+
for summary_group in summary_loop:
|
| 675 |
|
| 676 |
+
print("Creating summary for group:", summary_group)
|
| 677 |
+
|
| 678 |
+
summary_text = topic_summary_df.loc[topic_summary_df["Group"]==summary_group].to_markdown(index=False)
|
| 679 |
|
| 680 |
formatted_summary_prompt = [summarise_everything_prompt.format(topic_summary_table=summary_text, summary_format=comprehensive_summary_format_prompt)]
|
| 681 |
|
| 682 |
+
formatted_summarise_everything_system_prompt = summarise_everything_system_prompt.format(column_name=chosen_cols[0],consultation_context=context_textbox)
|
| 683 |
+
|
| 684 |
try:
|
| 685 |
+
response, conversation_history, metadata = summarise_output_topics_query(model_choice, in_api_key, temperature, formatted_summary_prompt, formatted_summarise_everything_system_prompt, local_model)
|
| 686 |
summarised_output = response
|
| 687 |
summarised_output = re.sub(r'\n{2,}', '\n', summarised_output) # Replace multiple line breaks with a single line break
|
| 688 |
summarised_output = re.sub(r'^\n{1,}', '', summarised_output) # Remove one or more line breaks at the start
|
| 689 |
summarised_output = summarised_output.strip()
|
| 690 |
except Exception as e:
|
| 691 |
+
print("Cannot create overall summary for group:", summary_group, "due to:", e)
|
| 692 |
summarised_output = ""
|
| 693 |
|
| 694 |
summarised_outputs.append(summarised_output)
|
| 695 |
+
txt_summarised_outputs.append(f"""Group name: {summary_group}\n""" + summarised_output)
|
| 696 |
+
|
| 697 |
out_metadata.extend(metadata)
|
| 698 |
out_metadata_str = '. '.join(out_metadata)
|
| 699 |
|
| 700 |
latest_summary_completed += 1
|
| 701 |
|
| 702 |
+
summary_group_short = clean_column_name(summary_group)
|
|
|
|
|
|
|
| 703 |
|
| 704 |
+
# Write outputs
|
| 705 |
+
overall_summary_output_path = output_folder + batch_file_path_details + "_overall_summary_grp" + summary_group_short + "_" + model_choice_clean_short + "_temp_" + str(temperature) + ".txt"
|
|
|
|
| 706 |
|
| 707 |
+
# Write single group outputs
|
| 708 |
try:
|
| 709 |
with open(overall_summary_output_path, "w", encoding='utf-8', errors='replace') as f:
|
| 710 |
f.write(summarised_output)
|
| 711 |
+
# output_files.append(overall_summary_output_path)
|
| 712 |
except Exception as e:
|
| 713 |
print(f"Error writing prompt to file {overall_summary_output_path}: {e}")
|
| 714 |
|
| 715 |
+
# Write overall outputs to csv
|
| 716 |
+
overall_summary_output_csv_path = output_folder + batch_file_path_details + "_overall_summary_" + model_choice_clean_short + "_temp_" + str(temperature) + ".csv"
|
| 717 |
+
summarised_outputs_df = pd.DataFrame(data={"Group":unique_groups, "Summary":summarised_outputs})
|
| 718 |
+
summarised_outputs_df.to_csv(overall_summary_output_csv_path, index=None)
|
| 719 |
+
output_files.append(overall_summary_output_csv_path)
|
| 720 |
+
|
| 721 |
+
markdown_output_table = summarised_outputs_df.to_markdown(index=False)
|
| 722 |
+
|
| 723 |
+
# Text output file
|
| 724 |
+
summarised_outputs_join = "\n".join(txt_summarised_outputs)
|
| 725 |
+
overall_summary_output_txt_path = output_folder + batch_file_path_details + "_overall_summary_" + model_choice_clean_short + "_temp_" + str(temperature) + ".txt"
|
| 726 |
+
|
| 727 |
+
try:
|
| 728 |
+
with open(overall_summary_output_txt_path, "w", encoding='utf-8', errors='replace') as f:
|
| 729 |
+
f.write(summarised_outputs_join)
|
| 730 |
+
output_files.append(overall_summary_output_txt_path)
|
| 731 |
+
except Exception as e:
|
| 732 |
+
print(f"Error writing prompt to file {overall_summary_output_txt_path}: {e}")
|
| 733 |
+
|
| 734 |
+
output_files = list(set(output_files))
|
| 735 |
+
|
| 736 |
+
# Check if beyond max time allowed for processing and break if necessary
|
| 737 |
+
toc = time.perf_counter()
|
| 738 |
+
time_taken = toc - tic
|
| 739 |
+
|
| 740 |
+
print("All group summaries created. Time taken:", time_taken)
|
| 741 |
|
| 742 |
+
return output_files, markdown_output_table
|
tools/llm_api_call.py
CHANGED
|
@@ -186,7 +186,7 @@ def data_file_to_markdown_table(file_data:pd.DataFrame, file_name:str, chosen_co
|
|
| 186 |
|
| 187 |
return simplified_csv_table_path, normalised_simple_markdown_table, start_row, end_row, batch_basic_response_data
|
| 188 |
|
| 189 |
-
def replace_punctuation_with_underscore(input_string):
|
| 190 |
# Create a translation table where each punctuation character maps to '_'
|
| 191 |
translation_table = str.maketrans(string.punctuation, '_' * len(string.punctuation))
|
| 192 |
|
|
|
|
| 186 |
|
| 187 |
return simplified_csv_table_path, normalised_simple_markdown_table, start_row, end_row, batch_basic_response_data
|
| 188 |
|
| 189 |
+
def replace_punctuation_with_underscore(input_string:str):
|
| 190 |
# Create a translation table where each punctuation character maps to '_'
|
| 191 |
translation_table = str.maketrans(string.punctuation, '_' * len(string.punctuation))
|
| 192 |
|
tools/prompts.py
CHANGED
|
@@ -1,4 +1,6 @@
|
|
| 1 |
-
|
|
|
|
|
|
|
| 2 |
|
| 3 |
initial_table_prompt = """The open text data is shown in the following table that contains two columns, Reference and Response. Response table:
|
| 4 |
{response_table}
|
|
@@ -48,6 +50,10 @@ New table:"""
|
|
| 48 |
|
| 49 |
# Return only one table in markdown format containing all relevant topics. Remove topics from the table that are not assigned to any response. Do not repeat Subtopics with the same Sentiment.
|
| 50 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 51 |
summarise_topic_descriptions_system_prompt = system_prompt
|
| 52 |
|
| 53 |
summarise_topic_descriptions_prompt = """Below is a table with number of paragraphs related to the data from the open text column:
|
|
@@ -62,36 +68,33 @@ single_para_summary_format_prompt = "Return a concise summary up to one paragrap
|
|
| 62 |
|
| 63 |
two_para_summary_format_prompt = "Return a summary up to two paragraphs long that includes as much detail as possible from the original text"
|
| 64 |
|
|
|
|
|
|
|
|
|
|
| 65 |
|
| 66 |
-
|
| 67 |
|
| 68 |
-
|
| 69 |
|
| 70 |
-
|
| 71 |
-
{topics}
|
| 72 |
|
| 73 |
-
Your task is to
|
| 74 |
|
| 75 |
-
|
| 76 |
|
|
|
|
| 77 |
|
| 78 |
-
|
| 79 |
|
| 80 |
-
summarise_everything_prompt = """Below is a table that gives an overview of the main topics from a dataset of open text responses along with a description of each topic, and the number of responses that mentioned each topic:
|
| 81 |
|
| 82 |
-
'{topic_summary_table}'
|
| 83 |
|
| 84 |
-
Your task is to summarise the above table in markdown format. {summary_format}. Return only the summary in markdown format and no other text.
|
| 85 |
|
| 86 |
-
|
|
|
|
|
|
|
| 87 |
|
| 88 |
-
comprehensive_summary_format_prompt = "Return a comprehensive summary that covers all the important topics and themes described in the table. If there are different values in the Group column of the data, compare and contrast differences between the topics and themes from each Group."
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
### Verify existing categories prompt
|
| 92 |
verify_titles_system_prompt = system_prompt
|
| 93 |
|
| 94 |
-
|
| 95 |
verify_titles_prompt = """Response numbers alongside the Response text and assigned descriptions are shown in the table below:
|
| 96 |
{response_table}
|
| 97 |
|
|
@@ -107,6 +110,16 @@ Do not add any other text to your response.
|
|
| 107 |
Output markdown table:"""
|
| 108 |
|
| 109 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 110 |
|
| 111 |
# example_instruction_prompt_llama3 = """<|start_header_id|>system<|end_header_id|>\n
|
| 112 |
# You are an AI assistant that follows instruction extremely well. Help as much as you can.<|eot_id|><|start_header_id|>user<|end_header_id|>\n
|
|
|
|
| 1 |
+
generic_system_prompt = """You are a researcher analysing responses from an open text dataset. You are analysing a single column from this dataset."""
|
| 2 |
+
|
| 3 |
+
system_prompt = """You are a researcher analysing responses from an open text dataset. You are analysing a single column from this dataset called '{column_name}'. The context of this analysis is '{consultation_context}'."""
|
| 4 |
|
| 5 |
initial_table_prompt = """The open text data is shown in the following table that contains two columns, Reference and Response. Response table:
|
| 6 |
{response_table}
|
|
|
|
| 50 |
|
| 51 |
# Return only one table in markdown format containing all relevant topics. Remove topics from the table that are not assigned to any response. Do not repeat Subtopics with the same Sentiment.
|
| 52 |
|
| 53 |
+
###
|
| 54 |
+
# SUMMARISE TOPICS PROMPT
|
| 55 |
+
###
|
| 56 |
+
|
| 57 |
summarise_topic_descriptions_system_prompt = system_prompt
|
| 58 |
|
| 59 |
summarise_topic_descriptions_prompt = """Below is a table with number of paragraphs related to the data from the open text column:
|
|
|
|
| 68 |
|
| 69 |
two_para_summary_format_prompt = "Return a summary up to two paragraphs long that includes as much detail as possible from the original text"
|
| 70 |
|
| 71 |
+
###
|
| 72 |
+
# OVERALL SUMMARY PROMPTS
|
| 73 |
+
###
|
| 74 |
|
| 75 |
+
summarise_everything_system_prompt = generic_system_prompt
|
| 76 |
|
| 77 |
+
summarise_everything_prompt = """Below is a table that gives an overview of the main topics from a dataset of open text responses along with a description of each topic, and the number of responses that mentioned each topic:
|
| 78 |
|
| 79 |
+
'{topic_summary_table}'
|
|
|
|
| 80 |
|
| 81 |
+
Your task is to summarise the above table. {summary_format}. Return only the summary and no other text.
|
| 82 |
|
| 83 |
+
Summary:"""
|
| 84 |
|
| 85 |
+
comprehensive_summary_format_prompt = "Return a comprehensive summary that covers all the important topics and themes described in the table. Structure the summary with General Topics as headings, with significant Subtopics described in bullet points below them in order of relative significance. Do not explicitly mention the Sentiment, Number of responses, or Group values. Do not use the words 'General Topic' or 'Subtopic' directly in the summary."
|
| 86 |
|
| 87 |
+
comprehensive_summary_format_prompt_by_group = "Return a comprehensive summary that covers all the important topics and themes described in the table. Structure the summary with General Topics as headings, with significant Subtopics described in bullet points below them in order of relative significance. Do not explicitly mention the Sentiment, Number of responses, or Group values. Do not use the words 'General Topic' or 'Subtopic' directly in the summary. Compare and contrast differences between the topics and themes from each Group."
|
| 88 |
|
|
|
|
| 89 |
|
|
|
|
| 90 |
|
|
|
|
| 91 |
|
| 92 |
+
###
|
| 93 |
+
# VERIFY EXISTING DESCRIPTIONS/TITLES
|
| 94 |
+
###
|
| 95 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 96 |
verify_titles_system_prompt = system_prompt
|
| 97 |
|
|
|
|
| 98 |
verify_titles_prompt = """Response numbers alongside the Response text and assigned descriptions are shown in the table below:
|
| 99 |
{response_table}
|
| 100 |
|
|
|
|
| 110 |
Output markdown table:"""
|
| 111 |
|
| 112 |
|
| 113 |
+
## The following didn't work well in testing and so is not currently used
|
| 114 |
+
|
| 115 |
+
create_general_topics_system_prompt = system_prompt
|
| 116 |
+
|
| 117 |
+
create_general_topics_prompt = """Subtopics known to be relevant to this dataset are shown in the following Topics table:
|
| 118 |
+
{topics}
|
| 119 |
+
|
| 120 |
+
Your task is to create a General Topic name for each Subtopic. The new Topics table should have the columns 'General Topic' and 'Subtopic' only. Write a 'General Topic' text label relevant to the Subtopic next to it in the new table. The text label should describe the general theme of the Subtopic. Do not add any other text, thoughts, or notes to your response.
|
| 121 |
+
|
| 122 |
+
New Topics table:"""
|
| 123 |
|
| 124 |
# example_instruction_prompt_llama3 = """<|start_header_id|>system<|end_header_id|>\n
|
| 125 |
# You are an AI assistant that follows instruction extremely well. Help as much as you can.<|eot_id|><|start_header_id|>user<|end_header_id|>\n
|