Spaces: Running on Zero

Commit · 99b54b3 · 1 Parent(s): 7122331

Package updates. Can now ask the model to only assign specified topics

Files changed:
- .dockerignore +1 -0
- .gitignore +1 -0
- app.py +53 -36
- requirements.txt +2 -2
- requirements_aws.txt +2 -2
- requirements_gpu.txt +4 -3
- tools/aws_functions.py +6 -1
- tools/helper_functions.py +155 -12
- tools/llm_api_call.py +202 -300
- tools/prompts.py +12 -7
.dockerignore
CHANGED
@@ -5,6 +5,7 @@
 *.ipynb
 *.xls
 *.xlsx
+*.csv
 examples/*
 output/*
 tools/__pycache__/*
.gitignore
CHANGED
@@ -5,6 +5,7 @@
 *.ipynb
 *.xls
 *.xlsx
+*.csv
 examples/*
 output/*
 tools/__pycache__/*
app.py
CHANGED
@@ -1,11 +1,11 @@
 import os
 import socket
 import spaces
-from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df, get_connection_params, output_folder, get_or_create_env_var, reveal_feedback_buttons, wipe_logs, model_full_names, view_table, empty_output_vars_extract_topics, empty_output_vars_summarise, RUN_LOCAL_MODEL
+from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df, get_connection_params, output_folder, get_or_create_env_var, reveal_feedback_buttons, wipe_logs, model_full_names, view_table, empty_output_vars_extract_topics, empty_output_vars_summarise, RUN_LOCAL_MODEL, load_in_previous_reference_file, join_cols_onto_reference_df, GEMINI_API_KEY
 from tools.aws_functions import upload_file_to_s3, RUN_AWS_FUNCTIONS
 from tools.llm_api_call import extract_topics, load_in_data_file, load_in_previous_data_files, sample_reference_table_summaries, summarise_output_topics, batch_size_default, deduplicate_topics, modify_existing_output_tables
 from tools.auth import authenticate_user
-from tools.prompts import initial_table_prompt, prompt2, prompt3, system_prompt, add_existing_topics_system_prompt, add_existing_topics_prompt, verify_titles_prompt, verify_titles_system_prompt
+from tools.prompts import initial_table_prompt, prompt2, prompt3, system_prompt, add_existing_topics_system_prompt, add_existing_topics_prompt, verify_titles_prompt, verify_titles_system_prompt, two_para_summary_format_prompt, single_para_summary_format_prompt
 from tools.verify_titles import verify_titles
 #from tools.aws_functions import load_data_from_aws
 import gradio as gr
@@ -22,18 +22,14 @@ host_name = socket.gethostname()
 access_logs_data_folder = 'logs/' + today_rev + '/' + host_name + '/'
 feedback_data_folder = 'feedback/' + today_rev + '/' + host_name + '/'
 usage_data_folder = 'usage/' + today_rev + '/' + host_name + '/'
-file_input_height =
-
-print("RUN_LOCAL_MODEL is:", RUN_LOCAL_MODEL)
+file_input_height = 200

 if RUN_LOCAL_MODEL == "1":
     default_model_choice = "gemma_2b_it_local"
-
 elif RUN_AWS_FUNCTIONS == "1":
     default_model_choice = "anthropic.claude-3-haiku-20240307-v1:0"
-
 else:
-    default_model_choice = "gemini-2.0-flash"
+    default_model_choice = "gemini-2.0-flash-001"

 # Create the gradio interface
 app = gr.Blocks(theme = gr.themes.Base())
@@ -95,7 +91,7 @@ with app:

 Instructions on use can be found in the README.md file. Try it out with this [dummy development consultation dataset](https://huggingface.co/datasets/seanpedrickcase/dummy_development_consultation), which you can also try with [zero-shot topics](https://huggingface.co/datasets/seanpedrickcase/dummy_development_consultation/blob/main/example_zero_shot.csv), or this [dummy case notes dataset](https://huggingface.co/datasets/seanpedrickcase/dummy_case_notes).

-You can use an AWS Bedrock model (Claude 3, paid), or Gemini (a free API, but with strict limits for the Pro model).
+You can use an AWS Bedrock model (Claude 3, paid), or Gemini (a free API, but with strict limits for the Pro model). The use of Gemini models requires an API key. To set up your own Gemini API key, go [here](https://aistudio.google.com/app/u/1/plan_information).

 NOTE: that **API calls to Gemini are not considered secure**, so please only submit redacted, non-sensitive tabular files to this source. Also, large language models are not 100% accurate and may produce biased or harmful outputs. All outputs from this app **absolutely need to be checked by a human** to check for harmful outputs, hallucinations, and accuracy.""")

@@ -107,7 +103,7 @@ with app:
 )
 with gr.Row():
 model_choice = gr.Dropdown(value = default_model_choice, choices = model_full_names, label="LLM model to use", multiselect=False)
-in_api_key = gr.Textbox(value =
+in_api_key = gr.Textbox(value = GEMINI_API_KEY, label="Enter Gemini API key (only if using Google API models)", lines=1, type="password")

 with gr.Accordion("Upload xlsx or csv file", open = True):
 in_data_files = gr.File(height=file_input_height, label="Choose Excel or csv files", file_count= "multiple", file_types=['.xlsx', '.xls', '.csv', '.parquet', '.csv.gz'])
@@ -116,12 +112,14 @@ with app:
 in_colnames = gr.Dropdown(choices=["Choose column with responses"], multiselect = False, label="Select the open text column of interest. In an Excel file, this shows columns across all sheets.", allow_custom_value=True, interactive=True)

 with gr.Accordion("I have my own list of topics (zero shot topic modelling).", open = False):
-candidate_topics = gr.File(height=file_input_height, label="Input topics from file (csv). File should have at least one column with a header, and all topic names below this. Using the headers 'General Topic' and/or 'Subtopic' will allow for these columns to be suggested to the model.")
-
+candidate_topics = gr.File(height=file_input_height, label="Input topics from file (csv). File should have at least one column with a header, and all topic names below this. Using the headers 'General Topic' and/or 'Subtopic' will allow for these columns to be suggested to the model. If a third column is present, it will be assumed to be a topic description.")
+with gr.Row(equal_height=True):
+force_zero_shot_radio = gr.Radio(label="Force responses into zero shot topics", value="No", choices=["Yes", "No"])
+force_single_topic_radio = gr.Radio(label="Ask the model to assign responses to only a single topic", value="No", choices=["Yes", "No"])

 context_textbox = gr.Textbox(label="Write up to one sentence giving context to the large language model for your task (e.g. 'Consultation for the construction of flats on Main Street')")

-sentiment_checkbox = gr.Radio(label="Choose sentiment categories to split responses", value="Negative
+sentiment_checkbox = gr.Radio(label="Choose sentiment categories to split responses", value="Negative or Positive", choices=["Negative or Positive", "Negative, Neutral, or Positive", "Do not assess sentiment"])

 extract_topics_btn = gr.Button("Extract topics", variant="primary")

@@ -153,10 +151,7 @@ with app:

 save_modified_files_button = gr.Button(value="Save modified topic names")

-
-with gr.Accordion("Upload reference data file and unique data files", open = True):
-
-
+with gr.Accordion("Upload reference data file and unique data files", open = True):
 ### DEDUPLICATION
 deduplication_input_files = gr.File(height=file_input_height, label="Upload files to deduplicate topics", file_count= "multiple", file_types=['.xlsx', '.xls', '.csv', '.parquet', '.csv.gz'])
 deduplication_input_files_status = gr.Textbox(value = "", label="Previous file input", visible=False)
@@ -168,11 +163,10 @@ with app:

 deduplicate_previous_data_btn = gr.Button("Deduplicate topics", variant="primary")

-
-### SUMMARISATION
+### SUMMARISATION
 summarisation_input_files = gr.File(height=file_input_height, label="Upload files to summarise", file_count= "multiple", file_types=['.xlsx', '.xls', '.csv', '.parquet', '.csv.gz'])

-summarise_format_radio = gr.Radio(label="Choose summary type", value=
+summarise_format_radio = gr.Radio(label="Choose summary type", value=two_para_summary_format_prompt, choices=[two_para_summary_format_prompt, single_para_summary_format_prompt])

 summarise_previous_data_btn = gr.Button("Summarise topics", variant="primary")
 summary_output_files = gr.File(height=file_input_height, label="Summarised output files", interactive=False)
@@ -198,10 +192,10 @@ with app:
 in_view_table = gr.File(height=file_input_height, label="Choose unique topic csv files", file_count= "single", file_types=['.csv', '.parquet', '.csv.gz'])
 view_table_markdown = gr.Markdown(value = "", label="View table", show_copy_button=True)

-with gr.Tab(label="Verify
+with gr.Tab(label="Verify descriptions"):
 gr.Markdown(
 """
-### Choose a tabular data file (xlsx or csv) with titles and original text to verify
+### Choose a tabular data file (xlsx or csv) with titles and original text to verify descriptions for.
 """
 )
 with gr.Row():
@@ -212,11 +206,11 @@ with app:
 verify_in_data_files = gr.File(height=file_input_height, label="Choose Excel or csv files", file_count= "multiple", file_types=['.xlsx', '.xls', '.csv', '.parquet', '.csv.gz'])

 verify_in_excel_sheets = gr.Dropdown(choices=["Choose Excel sheet"], multiselect = False, label="Select the Excel sheet.", visible=False, allow_custom_value=True)
-verify_in_colnames = gr.Dropdown(choices=["Choose column with responses"], multiselect = True, label="Select the open text columns that have a response and a title. In an Excel file, this shows columns across all sheets.", allow_custom_value=True, interactive=True)
+verify_in_colnames = gr.Dropdown(choices=["Choose column with responses"], multiselect = True, label="Select the open text columns that have a response and a title/description. In an Excel file, this shows columns across all sheets.", allow_custom_value=True, interactive=True)
 #verify_title_colnames = gr.Dropdown(choices=["Choose column with titles"], multiselect = False, label="Select the open text columns that have a title. In an Excel file, this shows columns across all sheets.", allow_custom_value=True, interactive=True)

-verify_titles_btn = gr.Button("Verify
-verify_titles_file_output = gr.File(height=file_input_height, label="
+verify_titles_btn = gr.Button("Verify descriptions", variant="primary")
+verify_titles_file_output = gr.File(height=file_input_height, label="Descriptions verification output files")
 verify_display_topic_table_markdown = gr.Markdown(value="### Language model response will appear here", show_copy_button=True)

 verify_modification_input_files_placeholder = gr.File(height=file_input_height, label="Placeholder for files to avoid errors", visible=False)
@@ -231,7 +225,7 @@ with app:
 batch_size_number = gr.Number(label = "Number of responses to submit in a single LLM query", value = batch_size_default, precision=0, minimum=1, maximum=100)
 random_seed = gr.Number(value=42, label="Random seed for LLM generation", visible=False)

-with gr.Accordion("Prompt settings", open =
+with gr.Accordion("Prompt settings", open = False):
 number_of_prompts = gr.Number(value=1, label="Number of prompts to send to LLM in sequence", minimum=1, maximum=3, visible=False)
 system_prompt_textbox = gr.Textbox(label="Initial system prompt", lines = 4, value = system_prompt)
 initial_table_prompt_textbox = gr.Textbox(label = "Initial topics prompt", lines = 8, value = initial_table_prompt)
@@ -241,9 +235,17 @@ with app:
 add_to_existing_topics_prompt_textbox = gr.Textbox(label = "Additional topics prompt", lines = 8, value = add_existing_topics_prompt)
 verify_titles_system_prompt_textbox = gr.Textbox(label="Additional topics system prompt", lines = 4, value = verify_titles_system_prompt)
 verify_titles_prompt_textbox = gr.Textbox(label = "Additional topics prompt", lines = 8, value = verify_titles_prompt)
-
-
-
+
+with gr.Accordion("Join additional columns to reference file outputs", open = False):
+join_colnames = gr.Dropdown(choices=["Choose column with responses"], multiselect = True, label="Select the open text column of interest. In an Excel file, this shows columns across all sheets.", allow_custom_value=True, interactive=True)
+with gr.Row():
+in_join_files = gr.File(height=file_input_height, label="Reference file should go here. Original data file should be loaded on the first tab.")
+join_cols_btn = gr.Button("Join columns to reference output", variant="primary")
+out_join_files = gr.File(height=file_input_height, label="Output joined reference files will go here.")
+
+with gr.Accordion("Logging outputs", open = False):
+log_files_output = gr.File(height=file_input_height, label="Log file output", interactive=False)
+conversation_metadata_textbox = gr.Textbox(label="Query metadata - usage counts and other parameters", interactive=False, lines=8)

 # Invisible text box to hold the session hash/username just for logging purposes
 session_hash_textbox = gr.Textbox(label = "Session hash", value="", visible=False)
@@ -271,25 +273,28 @@ with app:
 ###

 # Tabular data upload
-in_data_files.upload(fn=put_columns_in_df, inputs=[in_data_files], outputs=[in_colnames, in_excel_sheets, reference_data_file_name_textbox])
+in_data_files.upload(fn=put_columns_in_df, inputs=[in_data_files], outputs=[in_colnames, in_excel_sheets, reference_data_file_name_textbox, join_colnames])

 extract_topics_btn.click(fn=empty_output_vars_extract_topics, inputs=None, outputs=[master_topic_df_state, master_unique_topics_df_state, master_reference_df_state, topic_extraction_output_files, text_output_file_list_state, latest_batch_completed, log_files_output, log_files_output_list_state, conversation_metadata_textbox, estimated_time_taken_number, file_data_state, reference_data_file_name_textbox, display_topic_table_markdown]).\
 success(load_in_data_file,
 inputs = [in_data_files, in_colnames, batch_size_number, in_excel_sheets], outputs = [file_data_state, reference_data_file_name_textbox, total_number_of_batches], api_name="load_data").\
 success(fn=extract_topics,
-inputs=[in_data_files, file_data_state, master_topic_df_state, master_reference_df_state, master_unique_topics_df_state, display_topic_table_markdown, reference_data_file_name_textbox, total_number_of_batches, in_api_key, temperature_slide, in_colnames, model_choice, candidate_topics, latest_batch_completed, display_topic_table_markdown, text_output_file_list_state, log_files_output_list_state, first_loop_state, conversation_metadata_textbox, initial_table_prompt_textbox, prompt_2_textbox, prompt_3_textbox, system_prompt_textbox, add_to_existing_topics_system_prompt_textbox, add_to_existing_topics_prompt_textbox, number_of_prompts, batch_size_number, context_textbox, estimated_time_taken_number, sentiment_checkbox, force_zero_shot_radio, in_excel_sheets],
-outputs=[display_topic_table_markdown, master_topic_df_state, master_unique_topics_df_state, master_reference_df_state, topic_extraction_output_files, text_output_file_list_state, latest_batch_completed, log_files_output, log_files_output_list_state, conversation_metadata_textbox, estimated_time_taken_number, deduplication_input_files, summarisation_input_files, modifiable_unique_topics_df_state, modification_input_files], api_name="extract_topics")
+inputs=[in_data_files, file_data_state, master_topic_df_state, master_reference_df_state, master_unique_topics_df_state, display_topic_table_markdown, reference_data_file_name_textbox, total_number_of_batches, in_api_key, temperature_slide, in_colnames, model_choice, candidate_topics, latest_batch_completed, display_topic_table_markdown, text_output_file_list_state, log_files_output_list_state, first_loop_state, conversation_metadata_textbox, initial_table_prompt_textbox, prompt_2_textbox, prompt_3_textbox, system_prompt_textbox, add_to_existing_topics_system_prompt_textbox, add_to_existing_topics_prompt_textbox, number_of_prompts, batch_size_number, context_textbox, estimated_time_taken_number, sentiment_checkbox, force_zero_shot_radio, in_excel_sheets, force_single_topic_radio],
+outputs=[display_topic_table_markdown, master_topic_df_state, master_unique_topics_df_state, master_reference_df_state, topic_extraction_output_files, text_output_file_list_state, latest_batch_completed, log_files_output, log_files_output_list_state, conversation_metadata_textbox, estimated_time_taken_number, deduplication_input_files, summarisation_input_files, modifiable_unique_topics_df_state, modification_input_files, in_join_files], api_name="extract_topics")


 # If the output file count text box changes, keep going with redacting each data file until done. Then reveal the feedback buttons.
 # latest_batch_completed.change(fn=extract_topics,
 # inputs=[in_data_files, file_data_state, master_topic_df_state, master_reference_df_state, master_unique_topics_df_state, display_topic_table_markdown, reference_data_file_name_textbox, total_number_of_batches, in_api_key, temperature_slide, in_colnames, model_choice, candidate_topics, latest_batch_completed, display_topic_table_markdown, text_output_file_list_state, log_files_output_list_state, second_loop_state, conversation_metadata_textbox, initial_table_prompt_textbox, prompt_2_textbox, prompt_3_textbox, system_prompt_textbox, add_to_existing_topics_system_prompt_textbox, add_to_existing_topics_prompt_textbox, number_of_prompts, batch_size_number, context_textbox, estimated_time_taken_number, sentiment_checkbox, force_zero_shot_radio, in_excel_sheets],
-# outputs=[display_topic_table_markdown, master_topic_df_state, master_unique_topics_df_state, master_reference_df_state, topic_extraction_output_files, text_output_file_list_state, latest_batch_completed, log_files_output, log_files_output_list_state, conversation_metadata_textbox, estimated_time_taken_number, deduplication_input_files, summarisation_input_files, modifiable_unique_topics_df_state, modification_input_files]).\
+# outputs=[display_topic_table_markdown, master_topic_df_state, master_unique_topics_df_state, master_reference_df_state, topic_extraction_output_files, text_output_file_list_state, latest_batch_completed, log_files_output, log_files_output_list_state, conversation_metadata_textbox, estimated_time_taken_number, deduplication_input_files, summarisation_input_files, modifiable_unique_topics_df_state, modification_input_files, in_join_files]).\
 # success(fn = reveal_feedback_buttons,
 # outputs=[data_feedback_radio, data_further_details_text, data_submit_feedback_btn, data_feedback_title], scroll_to_output=True)

 # If you upload data into the deduplication input box, the modifiable topic dataframe box is updated
 modification_input_files.change(fn=load_in_previous_data_files, inputs=[modification_input_files, modified_unique_table_change_bool], outputs=[modifiable_unique_topics_df_state, master_modify_reference_df_state, master_modify_unique_topics_df_state, reference_data_file_name_textbox, unique_topics_table_file_name_textbox, text_output_modify_file_list_state])
+
+
+


 # Modify output table with custom topic names
@@ -314,17 +319,29 @@ with app:
 load_in_data_file, inputs = [in_data_files, in_colnames, batch_size_number, in_excel_sheets], outputs = [file_data_state, reference_data_file_name_textbox, total_number_of_batches]).\
 success(load_in_previous_data_files, inputs=[in_previous_data_files], outputs=[master_reference_df_state, master_unique_topics_df_state, latest_batch_completed, in_previous_data_files_status, reference_data_file_name_textbox])

-# VERIFY
+# VERIFY DESCRIPTIONS OF TEXT

 # Tabular data upload
-verify_in_data_files.upload(fn=put_columns_in_df, inputs=[verify_in_data_files], outputs=[verify_in_colnames, verify_in_excel_sheets, reference_data_file_name_textbox])
+verify_in_data_files.upload(fn=put_columns_in_df, inputs=[verify_in_data_files], outputs=[verify_in_colnames, verify_in_excel_sheets, reference_data_file_name_textbox, join_colnames])

 verify_titles_btn.click(fn=empty_output_vars_extract_topics, inputs=None, outputs=[master_topic_df_state, master_unique_topics_df_state, master_reference_df_state, topic_extraction_output_files, text_output_file_list_state, latest_batch_completed, log_files_output, log_files_output_list_state, conversation_metadata_textbox, estimated_time_taken_number, file_data_state, reference_data_file_name_textbox, display_topic_table_markdown]).\
 success(load_in_data_file,
 inputs = [verify_in_data_files, verify_in_colnames, batch_size_number, verify_in_excel_sheets], outputs = [file_data_state, reference_data_file_name_textbox, total_number_of_batches], api_name="verify_load_data").\
 success(fn=verify_titles,
 inputs=[verify_in_data_files, file_data_state, master_topic_df_state, master_reference_df_state, master_unique_topics_df_state, display_topic_table_markdown, reference_data_file_name_textbox, total_number_of_batches, verify_in_api_key, temperature_slide, verify_in_colnames, verify_model_choice, candidate_topics, latest_batch_completed, display_topic_table_markdown, text_output_file_list_state, log_files_output_list_state, first_loop_state, conversation_metadata_textbox, verify_titles_prompt_textbox, prompt_2_textbox, prompt_3_textbox, verify_titles_system_prompt_textbox, verify_titles_system_prompt_textbox, verify_titles_prompt_textbox, number_of_prompts, batch_size_number, context_textbox, estimated_time_taken_number, sentiment_checkbox, force_zero_shot_radio, in_excel_sheets],
-outputs=[verify_display_topic_table_markdown, master_topic_df_state, master_unique_topics_df_state, master_reference_df_state, verify_titles_file_output, text_output_file_list_state, latest_batch_completed, log_files_output, log_files_output_list_state, conversation_metadata_textbox, estimated_time_taken_number, deduplication_input_files, summarisation_input_files, modifiable_unique_topics_df_state, verify_modification_input_files_placeholder], api_name="
+outputs=[verify_display_topic_table_markdown, master_topic_df_state, master_unique_topics_df_state, master_reference_df_state, verify_titles_file_output, text_output_file_list_state, latest_batch_completed, log_files_output, log_files_output_list_state, conversation_metadata_textbox, estimated_time_taken_number, deduplication_input_files, summarisation_input_files, modifiable_unique_topics_df_state, verify_modification_input_files_placeholder], api_name="verify_descriptions")
+
+###
+# LLM SETTINGS PAGE
+###
+
+reference_df_data_file_name_textbox = gr.Textbox(label="reference_df_data_file_name_textbox", visible=False)
+master_reference_df_state_joined = gr.State(pd.DataFrame())
+
+join_cols_btn.click(fn=load_in_previous_reference_file, inputs=[in_join_files], outputs=[master_reference_df_state, reference_df_data_file_name_textbox]).\
+success(load_in_data_file,
+inputs = [in_data_files, in_colnames, batch_size_number, in_excel_sheets], outputs = [file_data_state, reference_data_file_name_textbox, total_number_of_batches]).\
+success(fn=join_cols_onto_reference_df, inputs=[master_reference_df_state, file_data_state, join_colnames, reference_df_data_file_name_textbox], outputs=[master_reference_df_state_joined, out_join_files])

 ###
 # LOGGING AND ON APP LOAD FUNCTIONS
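For reference, the zero-shot workflow above expects a small topics CSV. Below is a minimal sketch (not part of this commit; topic names and the file name are illustrative) of a file matching the candidate_topics description, including the optional third description column that this commit starts recognising.

import pandas as pd

# Illustrative zero-shot topics file: the headers 'General Topic' and 'Subtopic'
# are suggested to the model; a third column, if present, is treated as a topic description.
zero_shot_topics = pd.DataFrame({
    "General Topic": ["Transport", "Housing"],
    "Subtopic": ["Parking availability", "Building height"],
    "Description": ["Comments about parking provision", "Concerns about the height of the proposed flats"],
})
zero_shot_topics.to_csv("example_zero_shot.csv", index=False)

# Upload this file under "I have my own list of topics", then set
# "Force responses into zero shot topics" and/or the new
# "Ask the model to assign responses to only a single topic" radio to "Yes"
# so that the model only assigns the specified topics.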
requirements.txt
CHANGED
@@ -1,7 +1,7 @@
 pandas==2.2.3
-gradio==5.
+gradio==5.32.0
 spaces==0.34.1
-boto3==1.
+boto3==1.38.5
 pyarrow==19.0.1
 openpyxl==3.1.3
 markdown==3.7
requirements_aws.txt
CHANGED
@@ -1,7 +1,7 @@
 pandas==2.2.3
-gradio==5.
+gradio==5.32.0
 spaces==0.34.1
-boto3==1.
+boto3==1.38.5
 pyarrow==19.0.1
 openpyxl==3.1.3
 markdown==3.7
requirements_gpu.txt
CHANGED
@@ -1,7 +1,7 @@
 pandas==2.2.3
-gradio==5.
+gradio==5.32.0
 spaces==0.34.1
-boto3==1.
+boto3==1.38.5
 pyarrow==19.0.1
 openpyxl==3.1.3
 markdown==3.7
@@ -14,7 +14,8 @@ rapidfuzz==3.10.1
 torch==2.6.0 --extra-index-url https://download.pytorch.org/whl/cu121
 #llama-cpp-python==0.3.4 --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu121
 # Specify exact llama_cpp wheel for huggingface compatibility
-https://github.com/abetlen/llama-cpp-python/releases/download/v0.3.4-cu121/llama_cpp_python-0.3.4-cp311-cp311-linux_x86_64.whl
+#https://github.com/abetlen/llama-cpp-python/releases/download/v0.3.4-cu121/llama_cpp_python-0.3.4-cp311-cp311-linux_x86_64.whl
+https://github.com/abetlen/llama-cpp-python/releases/download/v0.3.4-cu121/llama_cpp_python-0.3.4-cp311-cp311-win_amd64.whl # Windows
 transformers==4.51.1
 numpy==1.26.4
 typing_extensions==4.12.2
tools/aws_functions.py
CHANGED
@@ -13,9 +13,14 @@ bucket_name=""
 AWS_REGION = get_or_create_env_var('AWS_REGION', 'eu-west-2')
 print(f'The value of AWS_REGION is {AWS_REGION}')

+CONSULTATION_SUMMARY_BUCKET = get_or_create_env_var('CONSULTATION_SUMMARY_BUCKET', '')
+print(f'The value of AWS_REGION is {CONSULTATION_SUMMARY_BUCKET}')
+
+
+
 if RUN_AWS_FUNCTIONS == "1":
     try:
-        bucket_name =
+        bucket_name = CONSULTATION_SUMMARY_BUCKET
         session = boto3.Session() # profile_name="default"
     except Exception as e:
         print(e)
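With this change the output bucket is resolved from an environment variable via get_or_create_env_var rather than being fixed in code. A minimal usage sketch (the bucket name is an assumption, and the variables must be set before tools.aws_functions is first imported):

import os

os.environ["RUN_AWS_FUNCTIONS"] = "1"
os.environ["CONSULTATION_SUMMARY_BUCKET"] = "my-consultation-summary-bucket"  # assumed bucket name

# bucket_name is set from CONSULTATION_SUMMARY_BUCKET at import time when RUN_AWS_FUNCTIONS == "1"
from tools.aws_functions import bucket_name
print(bucket_name)  # -> my-consultation-summary-bucket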
tools/helper_functions.py
CHANGED
@@ -1,6 +1,9 @@
 import os
+import re
 import gradio as gr
 import pandas as pd
+from typing import List
+import math

 def empty_output_vars_extract_topics():
     # Empty output objects before processing a new file
@@ -46,22 +49,35 @@ def get_or_create_env_var(var_name, default_value):

     return value

-RUN_AWS_FUNCTIONS = get_or_create_env_var("RUN_AWS_FUNCTIONS", "
+RUN_AWS_FUNCTIONS = get_or_create_env_var("RUN_AWS_FUNCTIONS", "1")
 print(f'The value of RUN_AWS_FUNCTIONS is {RUN_AWS_FUNCTIONS}')

 RUN_LOCAL_MODEL = get_or_create_env_var("RUN_LOCAL_MODEL", "1")
 print(f'The value of RUN_LOCAL_MODEL is {RUN_LOCAL_MODEL}')

+RUN_GEMINI_MODELS = get_or_create_env_var("RUN_GEMINI_MODELS", "1")
+print(f'The value of RUN_GEMINI_MODELS is {RUN_GEMINI_MODELS}')
+
+GEMINI_API_KEY = get_or_create_env_var('GEMINI_API_KEY', '')
+
+# Build up options for models
+model_full_names = []
+model_short_names = []
+
+if RUN_LOCAL_MODEL == "1":
+    model_full_names.append("gemma_2b_it_local")
+    model_short_names.append("gemma_local")
+
 if RUN_AWS_FUNCTIONS == "1":
-    model_full_names
-    model_short_names
-
-
-
+    model_full_names.extend(["anthropic.claude-3-haiku-20240307-v1:0", "anthropic.claude-3-sonnet-20240229-v1:0"])
+    model_short_names.extend(["haiku", "sonnet"])

-
-
-    model_short_names.remove("gemma_local")
+if RUN_GEMINI_MODELS == "1":
+    model_full_names.extend(["gemini-2.0-flash-001", "gemini-2.5-flash-preview-05-20", "gemini-2.5-pro-exp-05-06" ]) # , # Gemini pro No longer available on free tier
+    model_short_names.extend(["gemini_flash_2", "gemini_flash_2.5", "gemini_pro"])

+print("model_short_names:", model_short_names)
+print("model_full_names:", model_full_names)

 model_name_map = {short: full for short, full in zip(model_full_names, model_short_names)}

@@ -123,6 +139,113 @@ def read_file(filename:str, sheet:str=""):
     elif file_type == 'parquet':
         return pd.read_parquet(filename)

+def load_in_file(file_path: str, colnames:List[str]="", excel_sheet:str=""):
+    """
+    Loads in a tabular data file and returns data and file name.
+
+    Parameters:
+    - file_path (str): The path to the file to be processed.
+    - colnames (List[str], optional): list of colnames to load in
+    """
+
+    #file_type = detect_file_type(file_path)
+    #print("File type is:", file_type)
+
+    file_name = get_file_name_no_ext(file_path)
+    file_data = read_file(file_path, excel_sheet)
+
+    if colnames and isinstance(colnames, list):
+        col_list = colnames
+    else:
+        col_list = list(file_data.columns)
+
+    if not isinstance(col_list, List):
+        col_list = [col_list]
+
+    col_list = [item for item in col_list if item not in ["", "NA"]]
+
+    for col in col_list:
+        file_data[col] = file_data[col].fillna("")
+        file_data[col] = file_data[col].astype(str).str.replace("\bnan\b", "", regex=True)
+
+    #print(file_data[colnames])
+
+    return file_data, file_name
+
+def load_in_data_file(file_paths:List[str], in_colnames:List[str], batch_size:int=50, in_excel_sheets:str=""):
+    '''Load in data table, work out how many batches needed.'''
+
+    if not isinstance(in_colnames, list):
+        in_colnames = [in_colnames]
+
+    #print("in_colnames:", in_colnames)
+
+    try:
+        file_data, file_name = load_in_file(file_paths[0], colnames=in_colnames, excel_sheet=in_excel_sheets)
+        num_batches = math.ceil(len(file_data) / batch_size)
+        print("Total number of batches:", num_batches)
+
+    except Exception as e:
+        print(e)
+        file_data = pd.DataFrame()
+        file_name = ""
+        num_batches = 1
+
+    return file_data, file_name, num_batches
+
+def load_in_previous_reference_file(file:str):
+    '''Load in data table from a partially completed consultation summary to continue it.'''
+
+    reference_file_data = pd.DataFrame()
+    reference_file_name = ""
+    out_message = ""
+
+    #for file in file_paths:
+
+    print("file:", file)
+
+    # If reference table
+    if 'reference_table' in file:
+        try:
+            reference_file_data, reference_file_name = load_in_file(file)
+            #print("reference_file_data:", reference_file_data.head(2))
+            out_message = out_message + " Reference file load successful."
+        except Exception as e:
+            out_message = "Could not load reference file data:" + str(e)
+            raise Exception("Could not load reference file data:", e)
+
+    if reference_file_data.empty:
+        out_message = out_message + " No reference data table provided."
+        raise Exception(out_message)
+
+    print(out_message)
+
+    return reference_file_data, reference_file_name
+
+def join_cols_onto_reference_df(reference_df:pd.DataFrame, original_data_df:pd.DataFrame, join_columns:List[str], original_file_name:str, output_folder:str=output_folder):
+
+    #print("original_data_df columns:", original_data_df.columns)
+    #print("original_data_df:", original_data_df)
+
+    original_data_df.reset_index(names="Response References", inplace=True)
+    original_data_df["Response References"] += 1
+
+    #print("reference_df columns:", reference_df.columns)
+    #print("reference_df:", reference_df)
+
+    join_columns.append("Response References")
+
+    reference_df["Response References"] = reference_df["Response References"].fillna("-1").astype(int)
+
+    save_file_name = output_folder + original_file_name + "_j.csv"
+
+    out_reference_df = reference_df.merge(original_data_df[join_columns], on = "Response References", how="left")
+    out_reference_df.to_csv(save_file_name, index=None)
+
+    file_data_outputs = [save_file_name]
+
+    return out_reference_df, file_data_outputs
+
 # Wrap text in each column to the specified max width, including whole words
 def wrap_text(text:str, max_width=60, max_text_length=None):
     if not isinstance(text, str):
@@ -209,6 +332,26 @@ def wrap_text(text:str, max_width=60, max_text_length=None):

     return '<br>'.join(wrapped_lines)

+def initial_clean(text):
+    #### Some of my cleaning functions
+    html_pattern_regex = r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});|\xa0| '
+    html_start_pattern_end_dots_regex = r'<(.*?)\.\.'
+    non_ascii_pattern = r'[^\x00-\x7F]+'
+    multiple_spaces_regex = r'\s{2,}'
+
+    # Define a list of patterns and their replacements
+    patterns = [
+        (html_pattern_regex, ' '),
+        (html_start_pattern_end_dots_regex, ' '),
+        (non_ascii_pattern, ' '),
+        (multiple_spaces_regex, ' ')
+    ]
+
+    # Apply each regex replacement
+    for pattern, replacement in patterns:
+        text = re.sub(pattern, replacement, text)
+
+    return text

 def view_table(file_path: str): # Added max_width parameter
     df = pd.read_csv(file_path)
@@ -234,7 +377,7 @@ def ensure_output_folder_exists():
     else:
         print(f"The 'output/' folder already exists.")

-def put_columns_in_df(in_file):
+def put_columns_in_df(in_file:List[str]):
     new_choices = []
     concat_choices = []
     all_sheet_names = []
@@ -272,9 +415,9 @@ def put_columns_in_df(in_file):
     concat_choices = sorted(set(concat_choices))

     if number_of_excel_files > 0:
-        return gr.Dropdown(choices=concat_choices, value=concat_choices[0]), gr.Dropdown(choices=all_sheet_names, value=all_sheet_names[0], visible=True, interactive=True), file_end
+        return gr.Dropdown(choices=concat_choices, value=concat_choices[0]), gr.Dropdown(choices=all_sheet_names, value=all_sheet_names[0], visible=True, interactive=True), file_end, gr.Dropdown(choices=concat_choices)
     else:
-        return gr.Dropdown(choices=concat_choices, value=concat_choices[0]), gr.Dropdown(visible=False), file_end
+        return gr.Dropdown(choices=concat_choices, value=concat_choices[0]), gr.Dropdown(visible=False), file_end, gr.Dropdown(choices=concat_choices)

 # Following function is only relevant for locally-created executable files based on this app (when using pyinstaller it creates a _internal folder that contains tesseract and poppler. These need to be added to the system path to enable the app to run)
 def add_folder_to_path(folder_path: str):
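With the new RUN_LOCAL_MODEL, RUN_AWS_FUNCTIONS and RUN_GEMINI_MODELS flags above, the model dropdown contents are assembled from whichever back-ends are switched on. A minimal usage sketch (the environment values are assumptions, and they must be set before tools.helper_functions is first imported):

import os

os.environ["RUN_LOCAL_MODEL"] = "0"    # leave out the local Gemma model
os.environ["RUN_AWS_FUNCTIONS"] = "1"  # include the Bedrock Claude models
os.environ["RUN_GEMINI_MODELS"] = "1"  # include the Gemini models
os.environ["GEMINI_API_KEY"] = "your-key-here"  # pre-fills the Gemini API key textbox in app.py

from tools.helper_functions import model_full_names, model_short_names
print(model_full_names)
# ['anthropic.claude-3-haiku-20240307-v1:0', 'anthropic.claude-3-sonnet-20240229-v1:0',
#  'gemini-2.0-flash-001', 'gemini-2.5-flash-preview-05-20', 'gemini-2.5-pro-exp-05-06']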
tools/llm_api_call.py
CHANGED
@@ -19,8 +19,8 @@ from io import StringIO

 GradioFileData = gr.FileData

-from tools.prompts import initial_table_prompt, prompt2, prompt3, system_prompt, summarise_topic_descriptions_prompt, summarise_topic_descriptions_system_prompt, add_existing_topics_system_prompt, add_existing_topics_prompt, create_general_topics_system_prompt, create_general_topics_prompt, force_existing_topics_prompt, allow_new_topics_prompt
-from tools.helper_functions import output_folder,
 from tools.chatfuncs import LlamaCPPGenerationConfig, call_llama_cpp_model, load_model, RUN_LOCAL_MODEL

 # ResponseObject class for AWS Bedrock calls
@@ -59,62 +59,6 @@ def normalise_string(text):

     return text

-def load_in_file(file_path: str, colnames:List[str]="", excel_sheet:str=""):
-    """
-    Loads in a tabular data file and returns data and file name.
-
-    Parameters:
-    - file_path (str): The path to the file to be processed.
-    - colnames (List[str], optional): list of colnames to load in
-    """
-
-    file_type = detect_file_type(file_path)
-    #print("File type is:", file_type)
-
-    file_name = get_file_name_no_ext(file_path)
-    file_data = read_file(file_path, excel_sheet)
-
-    print("colnames:", colnames)
-
-    if colnames and isinstance(colnames, list):
-        col_list = colnames
-    else:
-        col_list = list(file_data.columns)
-
-    if not isinstance(col_list, List):
-        col_list = [col_list]
-
-    col_list = [item for item in col_list if item not in ["", "NA"]]
-
-    for col in col_list:
-        file_data[col] = file_data[col].fillna("")
-        file_data[col] = file_data[col].astype(str).str.replace("\bnan\b", "", regex=True)
-
-    #print(file_data[colnames])
-
-    return file_data, file_name
-
-def load_in_data_file(file_paths:List[str], in_colnames:List[str], batch_size:int=50, in_excel_sheets:str=""):
-    '''Load in data table, work out how many batches needed.'''
-
-    if not isinstance(in_colnames, list):
-        in_colnames = [in_colnames]
-
-    print("in_colnames:", in_colnames)
-
-    try:
-        file_data, file_name = load_in_file(file_paths[0], colnames=in_colnames, excel_sheet=in_excel_sheets)
-        num_batches = math.ceil(len(file_data) / batch_size)
-        print("Total number of batches:", num_batches)
-
-    except Exception as e:
-        print(e)
-        file_data = pd.DataFrame()
-        file_name = ""
-        num_batches = 1
-
-    return file_data, file_name, num_batches
-
 def load_in_previous_data_files(file_paths_partial_output:List[str], for_modified_table:bool=False):
     '''Load in data table from a partially completed consultation summary to continue it.'''

@@ -186,6 +130,7 @@ def load_in_previous_data_files(file_paths_partial_output:List[str], for_modified_table:bool=False):

     return gr.Dataframe(value=unique_file_data, headers=None, col_count=(unique_file_data.shape[1], "fixed"), row_count = (unique_file_data.shape[0], "fixed"), visible=True, type="pandas"), reference_file_data, unique_file_data, reference_file_name, unique_file_name, out_file_names

 def get_basic_response_data(file_data:pd.DataFrame, chosen_cols:List[str], verify_titles:bool=False) -> pd.DataFrame:

     if not isinstance(chosen_cols, list):
@@ -199,10 +144,12 @@ def get_basic_response_data(file_data:pd.DataFrame, chosen_cols:List[str], verify_titles:bool=False) -> pd.DataFrame:
     if verify_titles == True:
         basic_response_data = basic_response_data.rename(columns={chosen_cols[0]: "Response", chosen_cols[1]: "Title"})
         basic_response_data["Title"] = basic_response_data["Title"].str.strip()
     else:
         basic_response_data = basic_response_data.rename(columns={chosen_cols[0]: "Response"})

     basic_response_data["Response"] = basic_response_data["Response"].str.strip()

     return basic_response_data

@@ -245,12 +192,12 @@ def data_file_to_markdown_table(file_data:pd.DataFrame, file_name:str, chosen_co
     else:
         end_row = file_len + 1

-    print("start_row:", start_row)
-    print("end_row:", end_row)

     batch_basic_response_data = basic_response_data[start_row:end_row] # Select the current batch

-    print("batch_basic_response_data:", batch_basic_response_data)

     # Now replace the reference numbers with numbers starting from 1
     batch_basic_response_data.loc[:, "Reference"] = batch_basic_response_data["Reference"] - start_row
@@ -398,7 +345,7 @@ def call_aws_claude(prompt: str, system_prompt: str, temperature: float, max_tok

     # Now you can access both the text and metadata
     #print("Text:", response.text)
-    print("Metadata:", response.usage_metadata)
     #print("Text:", response.text)

     return response
@@ -428,7 +375,7 @@ def send_request(prompt: str, conversation_history: List[dict], model: object, c
     progress_bar = range(0,number_of_api_retry_attempts)

     # Generate the model's response
-    if

     for i in progress_bar:
         try:
@@ -451,7 +398,7 @@ def send_request(prompt: str, conversation_history: List[dict], model: object, c

     if i == number_of_api_retry_attempts:
         return ResponseObject(text="", usage_metadata={'RequestId':"FAILED"}), conversation_history
-    elif
     for i in progress_bar:
         try:
             print("Calling AWS Claude model, attempt", i + 1)
@@ -661,70 +608,6 @@ def clean_markdown_table(text: str):

     return result

-# def clean_markdown_table(text: str):
-#     lines = text.splitlines()
-
-#     # Remove any empty rows or rows with only pipes
-#     cleaned_lines = [line for line in lines if not re.match(r'^\s*\|?\s*\|?\s*$', line)]
-
-#     # Merge lines that belong to the same row (i.e., don't start with |)
-#     merged_lines = []
-#     buffer = ""
-
-#     for line in cleaned_lines:
-#         if line.lstrip().startswith('|'): # If line starts with |, it's a new row
-#             if buffer:
-#                 merged_lines.append(buffer) # Append the buffered content
-#             buffer = line # Start a new buffer with this row
-#         else:
-#             # Continuation of the previous row
-#             buffer += ' ' + line.strip() # Add content to the current buffer
-
-#     # Don't forget to append the last buffer
-#     if buffer:
-#         merged_lines.append(buffer)
-
-#     # Fix the header separator row if necessary
-#     if len(merged_lines) > 1:
-#         header_pipes = merged_lines[0].count('|') # Count pipes in the header row
-#         header_separator = '|---|' * (header_pipes - 1) + '|---|' # Generate proper separator
-
-#         # Replace or insert the separator row
-#         if not re.match(r'^\|[-:|]+$', merged_lines[1]): # Check if the second row is a valid separator
-#             merged_lines.insert(1, header_separator)
-#         else:
-#             # Adjust the separator to match the header pipes
-#             merged_lines[1] = '|---|' * (header_pipes - 1) + '|'
-
-#     # Ensure consistent number of pipes in each row
-#     result = []
-#     header_pipes = merged_lines[0].count('|') # Use the header row to count the number of pipes
-
-#     for line in merged_lines:
-#         # Strip excessive whitespace around pipes
-#         line = re.sub(r'\s*\|\s*', '|', line.strip())
-
-#         # Fix inconsistent number of pipes by adjusting them to match the header
-#         pipe_count = line.count('|')
-#         if pipe_count < header_pipes:
-#             line += '|' * (header_pipes - pipe_count) # Add missing pipes
-#         elif pipe_count > header_pipes:
-#             # If too many pipes, split line and keep the first `header_pipes` columns
-#             columns = line.split('|')[:header_pipes + 1] # +1 to keep last pipe at the end
-#             line = '|'.join(columns)
-
-#         line = re.sub(r'(\d),(?=\d)', r'\1, ', line)
-
-#         result.append(line)
-
-#     # Join lines back into the cleaned markdown text
-#     cleaned_text = '\n'.join(result)
-
-#     # Replace numbers next to commas and other numbers with a space
-
-#     return cleaned_text
-
 def clean_column_name(column_name, max_length=20):
     # Convert to string
     column_name = str(column_name)
@@ -751,31 +634,6 @@ def create_unique_table_df_from_reference_table(reference_df:pd.DataFrame):
     .assign(Topic_number=lambda df: np.arange(1, len(df) + 1)) # Add numbering 1 to x
     )

-    # new_unique_topics_df = reference_df[["General Topic", "Subtopic", "Sentiment"]]
-
-    # new_unique_topics_df = new_unique_topics_df.rename(columns={new_unique_topics_df.columns[0]: "General Topic", new_unique_topics_df.columns[1]: "Subtopic", new_unique_topics_df.columns[2]: "Sentiment"})
-
-    # # Join existing and new unique topics
-    # out_unique_topics_df = new_unique_topics_df
-
-    # out_unique_topics_df = out_unique_topics_df.rename(columns={out_unique_topics_df.columns[0]: "General Topic", out_unique_topics_df.columns[1]: "Subtopic", out_unique_topics_df.columns[2]: "Sentiment"})
|
762 |
-
|
763 |
-
# #print("out_unique_topics_df:", out_unique_topics_df)
|
764 |
-
|
765 |
-
# out_unique_topics_df = out_unique_topics_df.drop_duplicates(["General Topic", "Subtopic", "Sentiment"]).\
|
766 |
-
# drop(["Response References", "Summary"], axis = 1, errors="ignore")
|
767 |
-
|
768 |
-
# # Get count of rows that refer to particular topics
|
769 |
-
# reference_counts = reference_df.groupby(["General Topic", "Subtopic", "Sentiment"]).agg({
|
770 |
-
# 'Response References': 'size', # Count the number of references
|
771 |
-
# 'Summary': lambda x: '<br>'.join(
|
772 |
-
# sorted(set(x), key=lambda summary: reference_df.loc[reference_df['Summary'] == summary, 'Start row of group'].min())
|
773 |
-
# )
|
774 |
-
# }).reset_index()
|
775 |
-
|
776 |
-
# # Join the counts to existing_unique_topics_df
|
777 |
-
# out_unique_topics_df = out_unique_topics_df.merge(reference_counts, how='left', on=["General Topic", "Subtopic", "Sentiment"]).sort_values("Response References", ascending=False)
|
778 |
-
|
779 |
return out_unique_topics_df
|
780 |
|
781 |
# Convert output table to markdown and then to a pandas dataframe to csv
|
@@ -933,8 +791,6 @@ def call_llm_with_markdown_table_checks(batch_prompts: List[str],
|
|
933 |
call_temperature, reported_batch_no, local_model, master=master
|
934 |
)
|
935 |
|
936 |
-
print("Responses:", responses)
|
937 |
-
|
938 |
if (model_choice != "gemma_local") & (model_choice != "gemma_2b_it_local"):
|
939 |
stripped_response = responses[-1].text.strip()
|
940 |
else:
|
@@ -1041,7 +897,16 @@ def write_llm_output_and_logs(responses: List[ResponseObject],
|
|
1041 |
return topic_table_out_path, reference_table_out_path, unique_topics_df_out_path, topic_with_response_df, markdown_table, out_reference_df, out_unique_topics_df, batch_file_path_details, is_error
|
1042 |
|
1043 |
# Rename columns to ensure consistent use of data frames later in code
|
1044 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1045 |
|
1046 |
# Fill in NA rows with values from above (topics seem to be included only on one row):
|
1047 |
topic_with_response_df = topic_with_response_df.ffill()
|
@@ -1073,8 +938,8 @@ def write_llm_output_and_logs(responses: List[ResponseObject],
|
|
1073 |
sentiment = row.iloc[2] if pd.notna(row.iloc[2]) else ""
|
1074 |
summary = row.iloc[4] if pd.notna(row.iloc[4]) else ""
|
1075 |
# If the reference response column is very long, and there's nothing in the summary column, assume that the summary was put in the reference column
|
1076 |
-
if not summary and len(row.iloc[3] > 30):
|
1077 |
-
summary = row.iloc[3]
|
1078 |
|
1079 |
summary = row_number_string_start + summary
|
1080 |
|
@@ -1151,6 +1016,128 @@ def write_llm_output_and_logs(responses: List[ResponseObject],
|
|
1151 |
|
1152 |
return topic_table_out_path, reference_table_out_path, unique_topics_df_out_path, topic_with_response_df, markdown_table, out_reference_df, out_unique_topics_df, batch_file_path_details, is_error
|
1153 |
|
|
|
|
|
|
|
|
|
|
|
|
1154 |
@spaces.GPU
|
1155 |
def extract_topics(in_data_file,
|
1156 |
file_data:pd.DataFrame,
|
@@ -1184,6 +1171,8 @@ def extract_topics(in_data_file,
|
|
1184 |
sentiment_checkbox:str = "Negative, Neutral, or Positive",
|
1185 |
force_zero_shot_radio:str = "No",
|
1186 |
in_excel_sheets:List[str] = [],
|
|
|
|
|
1187 |
max_tokens:int=max_tokens,
|
1188 |
model_name_map:dict=model_name_map,
|
1189 |
max_time_for_loop:int=max_time_for_loop,
|
@@ -1224,7 +1213,9 @@ def extract_topics(in_data_file,
|
|
1224 |
- time_taken (float, optional): The amount of time taken to process the responses up until this point.
|
1225 |
- sentiment_checkbox (str, optional): What type of sentiment analysis should the topic modeller do?
|
1226 |
- force_zero_shot_radio (str, optional): Should responses be forced into a zero shot topic or not.
|
1227 |
-
- in_excel_sheets (List[str], optional): List of excel sheets to load from input file
|
|
|
|
|
1228 |
- max_tokens (int): The maximum number of tokens for the model.
|
1229 |
- model_name_map (dict, optional): A dictionary mapping full model name to shortened.
|
1230 |
- max_time_for_loop (int, optional): The number of seconds maximum that the function should run for before breaking (to run again, this is to avoid timeouts with some AWS services if deployed there).
|
@@ -1254,17 +1245,13 @@ def extract_topics(in_data_file,
|
|
1254 |
if file_data.empty:
|
1255 |
print("No data table found, loading from file")
|
1256 |
try:
|
1257 |
-
#print("in_data_file:", in_data_file)
|
1258 |
in_colnames_drop, in_excel_sheets, file_name = put_columns_in_df(in_data_file)
|
1259 |
-
#print("in_colnames:", in_colnames_drop)
|
1260 |
file_data, file_name, num_batches = load_in_data_file(in_data_file, chosen_cols, batch_size_default, in_excel_sheets)
|
1261 |
-
#print("file_data loaded in:", file_data)
|
1262 |
except:
|
1263 |
# Check if files and text exist
|
1264 |
out_message = "Please enter a data file to summarise."
|
1265 |
print(out_message)
|
1266 |
raise Exception(out_message)
|
1267 |
-
#return out_message, existing_topics_table, existing_unique_topics_df, existing_reference_df, out_file_paths, out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, out_file_paths, out_file_paths#, out_message
|
1268 |
|
1269 |
|
1270 |
#model_choice_clean = replace_punctuation_with_underscore(model_choice)
|
@@ -1277,12 +1264,10 @@ def extract_topics(in_data_file,
|
|
1277 |
latest_batch_completed = 0
|
1278 |
out_message = []
|
1279 |
out_file_paths = []
|
1280 |
-
#print("model_choice_clean:", model_choice_clean)
|
1281 |
|
1282 |
if (model_choice == "gemma_2b_it_local") & (RUN_LOCAL_MODEL == "1"):
|
1283 |
progress(0.1, "Loading in Gemma 2b model")
|
1284 |
local_model, tokenizer = load_model()
|
1285 |
-
print("Local model loaded:", local_model)
|
1286 |
|
1287 |
if num_batches > 0:
|
1288 |
progress_measure = round(latest_batch_completed / num_batches, 1)
|
@@ -1301,12 +1286,10 @@ def extract_topics(in_data_file,
|
|
1301 |
out_file_paths = []
|
1302 |
|
1303 |
|
1304 |
-
if
|
1305 |
out_message = "Your data has more than 300 rows, using the Sonnet model will be too expensive. Please choose the Haiku model instead."
|
1306 |
print(out_message)
|
1307 |
-
raise Exception(out_message)
|
1308 |
-
#return out_message, existing_topics_table, existing_unique_topics_df, existing_reference_df, out_file_paths, out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, out_file_paths, out_file_paths#, out_message
|
1309 |
-
|
1310 |
|
1311 |
if sentiment_checkbox == "Negative, Neutral, or Positive": sentiment_prompt = "In the third column, write the sentiment of the Subtopic: Negative, Neutral, or Positive"
|
1312 |
elif sentiment_checkbox == "Negative or Positive": sentiment_prompt = "In the third column, write the sentiment of the Subtopic: Negative or Positive"
|
@@ -1337,10 +1320,10 @@ def extract_topics(in_data_file,
|
|
1337 |
if latest_batch_completed >= 1 or candidate_topics is not None:
|
1338 |
|
1339 |
# Prepare Gemini models before query
|
1340 |
-
if
|
1341 |
print("Using Gemini model:", model_choice)
|
1342 |
model, config = construct_gemini_generative_model(in_api_key=in_api_key, temperature=temperature, model_choice=model_choice, system_prompt=add_existing_topics_system_prompt, max_tokens=max_tokens)
|
1343 |
-
elif
|
1344 |
print("Using AWS Bedrock model:", model_choice)
|
1345 |
else:
|
1346 |
print("Using local model:", model_choice)
|
@@ -1351,109 +1334,17 @@ def extract_topics(in_data_file,
|
|
1351 |
|
1352 |
# 'Zero shot topics' are those supplied by the user
|
1353 |
max_topic_no = 120
|
1354 |
-
zero_shot_topics = read_file(candidate_topics.name)
|
1355 |
-
|
1356 |
-
|
1357 |
-
|
1358 |
-
|
1359 |
-
|
1360 |
-
|
1361 |
-
|
1362 |
-
|
1363 |
-
|
1364 |
-
|
1365 |
-
zero_shot_topics.loc[:, x]
|
1366 |
-
.str.strip()
|
1367 |
-
.str.replace('\n', ' ')
|
1368 |
-
.str.replace('\r', ' ')
|
1369 |
-
.str.replace('/', ' or ')
|
1370 |
-
.str.lower()
|
1371 |
-
.str.capitalize())
|
1372 |
-
|
1373 |
-
# If number of columns is 1, keep only subtopics
|
1374 |
-
if zero_shot_topics.shape[1] == 1 and "General Topic" not in zero_shot_topics.columns:
|
1375 |
-
zero_shot_topics_gen_topics_list = [""] * zero_shot_topics.shape[0]
|
1376 |
-
zero_shot_topics_subtopics_list = list(zero_shot_topics.iloc[:, 0])
|
1377 |
-
# Allow for possibility that the user only wants to set general topics and not subtopics
|
1378 |
-
elif zero_shot_topics.shape[1] == 1 and "General Topic" in zero_shot_topics.columns:
|
1379 |
-
zero_shot_topics_gen_topics_list = list(zero_shot_topics["General Topic"])
|
1380 |
-
zero_shot_topics_subtopics_list = [""] * zero_shot_topics.shape[0]
|
1381 |
-
# If general topic and subtopic are specified
|
1382 |
-
elif set(["General Topic", "Subtopic"]).issubset(zero_shot_topics.columns):
|
1383 |
-
zero_shot_topics_gen_topics_list = list(zero_shot_topics["General Topic"])
|
1384 |
-
zero_shot_topics_subtopics_list = list(zero_shot_topics["Subtopic"])
|
1385 |
-
# If number of columns is 2, keep general topics and subtopics
|
1386 |
-
elif zero_shot_topics.shape[1] == 2:
|
1387 |
-
zero_shot_topics_gen_topics_list = list(zero_shot_topics.iloc[:, 0])
|
1388 |
-
zero_shot_topics_subtopics_list = list(zero_shot_topics.iloc[:, 1])
|
1389 |
-
else:
|
1390 |
-
# If there are more columns, just assume that the first column was meant to be a subtopic
|
1391 |
-
zero_shot_topics_gen_topics_list = [""] * zero_shot_topics.shape[0]
|
1392 |
-
zero_shot_topics_subtopics_list = list(zero_shot_topics.iloc[:, 0])
|
1393 |
-
|
1394 |
-
# If the responses are being forced into zero shot topics, allow an option for nothing relevant
|
1395 |
-
if force_zero_shot_radio == "Yes":
|
1396 |
-
zero_shot_topics_gen_topics_list.append("")
|
1397 |
-
zero_shot_topics_subtopics_list.append("No topics are relevant to the response")
|
1398 |
-
|
1399 |
-
if create_revised_general_topics == True:
|
1400 |
-
# Create the most up to date list of topics and subtopics.
|
1401 |
-
# If there are candidate topics, but the existing_unique_topics_df hasn't yet been constructed, then create.
|
1402 |
-
unique_topics_df = pd.DataFrame(data={
|
1403 |
-
"General Topic":zero_shot_topics_gen_topics_list,
|
1404 |
-
"Subtopic":zero_shot_topics_subtopics_list
|
1405 |
-
})
|
1406 |
-
unique_topics_markdown = unique_topics_df.to_markdown()
|
1407 |
-
|
1408 |
-
print("unique_topics_markdown:", unique_topics_markdown)
|
1409 |
-
|
1410 |
-
formatted_general_topics_system_prompt = create_general_topics_system_prompt.format(consultation_context=context_textbox, column_name=chosen_cols)
|
1411 |
-
|
1412 |
-
# Format the general_topics prompt with the topics
|
1413 |
-
formatted_general_topics_prompt = create_general_topics_prompt.format(topics=unique_topics_markdown)
|
1414 |
-
|
1415 |
-
if model_choice == "gemma_2b_it_local":
|
1416 |
-
formatted_general_topics_prompt = llama_cpp_prefix + formatted_general_topics_system_prompt + "\n" + formatted_general_topics_prompt + llama_cpp_suffix
|
1417 |
-
|
1418 |
-
formatted_general_topics_prompt_list = [formatted_general_topics_prompt]
|
1419 |
-
|
1420 |
-
|
1421 |
-
|
1422 |
-
whole_conversation = []
|
1423 |
-
|
1424 |
-
general_topic_response, general_topic_conversation_history, general_topic_conversation, general_topic_conversation_metadata, response_text = call_llm_with_markdown_table_checks(batch_prompts, system_prompt, conversation_history, whole_conversation, whole_conversation_metadata, model, config, model_choice, temperature, reported_batch_no, local_model, MAX_OUTPUT_VALIDATION_ATTEMPTS, master = True)
|
1425 |
-
|
1426 |
-
# Convert response text to a markdown table
|
1427 |
-
try:
|
1428 |
-
zero_shot_topics_df, is_error = convert_response_text_to_markdown_table(response_text, table_type = "Revised topics table")
|
1429 |
-
print("Output revised zero shot topics table is:", zero_shot_topics_df)
|
1430 |
-
|
1431 |
-
zero_shot_revised_path = output_folder + "zero_shot_topics_with_general_topics.csv"
|
1432 |
-
#zero_shot_topics_df.to_csv(zero_shot_revised_path, index = None)
|
1433 |
-
out_file_paths.append(zero_shot_revised_path)
|
1434 |
-
|
1435 |
-
except Exception as e:
|
1436 |
-
print("Error in parsing markdown table from response text:", e, "Not adding revised General Topics to table")
|
1437 |
-
zero_shot_topics_df = pd.DataFrame(data={
|
1438 |
-
"General Topic":zero_shot_topics_gen_topics_list,
|
1439 |
-
"Subtopic":zero_shot_topics_subtopics_list})
|
1440 |
-
|
1441 |
-
if zero_shot_topics_df.empty:
|
1442 |
-
print("Creation of revised general topics df failed, reverting to original list")
|
1443 |
-
zero_shot_topics_df = pd.DataFrame(data={
|
1444 |
-
"General Topic":zero_shot_topics_gen_topics_list,
|
1445 |
-
"Subtopic":zero_shot_topics_subtopics_list})
|
1446 |
-
else:
|
1447 |
-
zero_shot_topics_df = pd.DataFrame(data={
|
1448 |
-
"General Topic":zero_shot_topics_gen_topics_list,
|
1449 |
-
"Subtopic":zero_shot_topics_subtopics_list})
|
1450 |
-
|
1451 |
-
|
1452 |
-
# This part concatenates all zero shot and new topics together, so that for the next prompt the LLM will have the full list available
|
1453 |
-
if not existing_unique_topics_df.empty:
|
1454 |
-
existing_unique_topics_df = pd.concat([existing_unique_topics_df, zero_shot_topics_df]).drop_duplicates("Subtopic")
|
1455 |
-
else:
|
1456 |
-
existing_unique_topics_df = zero_shot_topics_df
|
1457 |
|
1458 |
if candidate_topics and not zero_shot_topics_df.empty:
|
1459 |
# If you have already created revised zero shot topics, concat to the current
|
@@ -1464,24 +1355,40 @@ def extract_topics(in_data_file,
|
|
1464 |
existing_unique_topics_df.fillna("", inplace=True)
|
1465 |
existing_unique_topics_df["General Topic"] = existing_unique_topics_df["General Topic"].str.replace('(?i)^Nan$', '', regex=True)
|
1466 |
existing_unique_topics_df["Subtopic"] = existing_unique_topics_df["Subtopic"].str.replace('(?i)^Nan$', '', regex=True)
|
|
|
|
|
|
|
|
|
1467 |
|
1468 |
# print("existing_unique_topics_df:", existing_unique_topics_df)
|
1469 |
|
1470 |
# If user has chosen to try to force zero shot topics, then the prompt is changed to ask the model not to deviate at all from submitted topic list.
|
1471 |
-
|
1472 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1473 |
topic_assignment_prompt = force_existing_topics_prompt
|
1474 |
else:
|
1475 |
-
|
1476 |
-
|
1477 |
-
|
|
|
|
|
|
|
|
|
|
|
1478 |
|
1479 |
# Format the summary prompt with the response table and topics
|
1480 |
formatted_system_prompt = add_existing_topics_system_prompt.format(consultation_context=context_textbox, column_name=chosen_cols)
|
1481 |
-
formatted_summary_prompt = add_existing_topics_prompt.format(response_table=normalised_simple_markdown_table, topics=unique_topics_markdown, topic_assignment=topic_assignment_prompt, sentiment_choices=sentiment_prompt)
|
1482 |
|
1483 |
|
1484 |
-
if
|
1485 |
formatted_summary_prompt = llama_cpp_prefix + formatted_system_prompt + "\n" + formatted_summary_prompt + llama_cpp_suffix
|
1486 |
full_prompt = formatted_summary_prompt
|
1487 |
else:
|
@@ -1499,7 +1406,7 @@ def extract_topics(in_data_file,
|
|
1499 |
except Exception as e:
|
1500 |
print(f"Error writing prompt to file {formatted_prompt_output_path}: {e}")
|
1501 |
|
1502 |
-
if
|
1503 |
summary_prompt_list = [full_prompt] # Includes system prompt
|
1504 |
else:
|
1505 |
summary_prompt_list = [formatted_summary_prompt]
|
@@ -1510,13 +1417,9 @@ def extract_topics(in_data_file,
|
|
1510 |
whole_conversation = []
|
1511 |
|
1512 |
# Process requests to large language model
|
1513 |
-
# responses, summary_conversation_history, whole_conversation, whole_conversation_metadata, response_text = process_requests(summary_prompt_list, add_existing_topics_system_prompt, summary_conversation_history, summary_whole_conversation, whole_conversation_metadata, model, config, model_choice, temperature, reported_batch_no, local_model, master = True)
|
1514 |
-
|
1515 |
responses, conversation_history, whole_conversation, whole_conversation_metadata, response_text = call_llm_with_markdown_table_checks(summary_prompt_list, system_prompt, conversation_history, whole_conversation, whole_conversation_metadata, model, config, model_choice, temperature, reported_batch_no, local_model, MAX_OUTPUT_VALIDATION_ATTEMPTS, master = True)
|
1516 |
|
1517 |
-
#
|
1518 |
-
# print("Whole conversation metadata:", whole_conversation_metadata)
|
1519 |
-
|
1520 |
topic_table_out_path, reference_table_out_path, unique_topics_df_out_path, new_topic_df, new_markdown_table, new_reference_df, new_unique_topics_df, master_batch_out_file_part, is_error = write_llm_output_and_logs(responses, whole_conversation, whole_conversation_metadata, file_name, latest_batch_completed, start_row, end_row, model_choice_clean, temperature, log_files_output_paths, existing_reference_df, existing_unique_topics_df, batch_size, chosen_cols, first_run=False)
|
1521 |
|
1522 |
# Write final output to text file for logging purposes
|
@@ -1541,7 +1444,6 @@ def extract_topics(in_data_file,
|
|
1541 |
if is_error == True:
|
1542 |
final_message_out = "Could not complete summary, error in LLM output."
|
1543 |
raise Exception(final_message_out)
|
1544 |
-
#return unique_table_df_display_table_markdown, new_topic_df, new_unique_topics_df, new_reference_df, out_file_paths, out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, out_file_paths#, final_message_out
|
1545 |
|
1546 |
# Write outputs to csv
|
1547 |
## Topics with references
|
@@ -1560,7 +1462,7 @@ def extract_topics(in_data_file,
|
|
1560 |
|
1561 |
# Outputs for markdown table output
|
1562 |
unique_table_df_display_table = new_unique_topics_df.apply(lambda col: col.map(lambda x: wrap_text(x, max_text_length=500)))
|
1563 |
-
unique_table_df_display_table_markdown = unique_table_df_display_table.to_markdown(index=False)
|
1564 |
|
1565 |
#whole_conversation_metadata.append(whole_conversation_metadata_str)
|
1566 |
whole_conversation_metadata_str = ' '.join(whole_conversation_metadata)
|
@@ -1579,11 +1481,11 @@ def extract_topics(in_data_file,
|
|
1579 |
#system_prompt = system_prompt + normalised_simple_markdown_table
|
1580 |
|
1581 |
# Prepare Gemini models before query
|
1582 |
-
if
|
1583 |
print("Using Gemini model:", model_choice)
|
1584 |
model, config = construct_gemini_generative_model(in_api_key=in_api_key, temperature=temperature, model_choice=model_choice, system_prompt=system_prompt, max_tokens=max_tokens)
|
1585 |
-
elif
|
1586 |
-
print("Using local Gemma
|
1587 |
else:
|
1588 |
print("Using AWS Bedrock model:", model_choice)
|
1589 |
|
@@ -1597,7 +1499,7 @@ def extract_topics(in_data_file,
|
|
1597 |
if prompt3: formatted_prompt3 = prompt3.format(response_table=normalised_simple_markdown_table, sentiment_choices=sentiment_prompt)
|
1598 |
else: formatted_prompt3 = prompt3
|
1599 |
|
1600 |
-
if
|
1601 |
formatted_initial_table_prompt = llama_cpp_prefix + formatted_initial_table_system_prompt + "\n" + formatted_initial_table_prompt + llama_cpp_suffix
|
1602 |
formatted_prompt2 = llama_cpp_prefix + formatted_initial_table_system_prompt + "\n" + formatted_prompt2 + llama_cpp_suffix
|
1603 |
formatted_prompt3 = llama_cpp_prefix + formatted_initial_table_system_prompt + "\n" + formatted_prompt3 + llama_cpp_suffix
|
@@ -1703,6 +1605,8 @@ def extract_topics(in_data_file,
|
|
1703 |
# Set to a very high number so as not to mess with subsequent file processing by the user
|
1704 |
#latest_batch_completed = 999
|
1705 |
|
|
|
|
|
1706 |
toc = time.perf_counter()
|
1707 |
final_time = (toc - tic) + time_taken
|
1708 |
out_time = f"Everything finished in {round(final_time,1)} seconds."
|
@@ -1733,6 +1637,7 @@ def extract_topics(in_data_file,
|
|
1733 |
## Reference table mapping response numbers to topics
|
1734 |
existing_reference_df.to_csv(reference_table_out_path, index=None)
|
1735 |
out_file_paths.append(reference_table_out_path)
|
|
|
1736 |
|
1737 |
# Create final unique topics table from reference table to ensure consistent numbers
|
1738 |
final_out_unique_topics_df = create_unique_table_df_from_reference_table(existing_reference_df)
|
@@ -1787,13 +1692,10 @@ def extract_topics(in_data_file,
|
|
1787 |
|
1788 |
print("latest_batch_completed at end of batch iterations to return is", latest_batch_completed)
|
1789 |
|
1790 |
-
return unique_table_df_display_table_markdown, existing_topics_table, final_out_unique_topics_df, existing_reference_df, final_out_file_paths, final_out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, final_out_file_paths, final_out_file_paths, gr.Dataframe(value=modifiable_unique_topics_df, headers=None, col_count=(modifiable_unique_topics_df.shape[1], "fixed"), row_count = (modifiable_unique_topics_df.shape[0], "fixed"), visible=True, type="pandas"), final_out_file_paths
|
1791 |
-
|
1792 |
-
|
1793 |
-
return unique_table_df_display_table_markdown, existing_topics_table, existing_unique_topics_df, existing_reference_df, out_file_paths, out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, out_file_paths, out_file_paths, gr.Dataframe(value=modifiable_unique_topics_df, headers=None, col_count=(modifiable_unique_topics_df.shape[1], "fixed"), row_count = (modifiable_unique_topics_df.shape[0], "fixed"), visible=True, type="pandas"), out_file_paths
|
1794 |
-
|
1795 |
|
1796 |
|
|
|
1797 |
|
1798 |
def convert_reference_table_to_pivot_table(df:pd.DataFrame, basic_response_data:pd.DataFrame=pd.DataFrame()):
|
1799 |
|
@@ -2302,7 +2204,7 @@ def summarise_output_topics_query(model_choice:str, in_api_key:str, temperature:
|
|
2302 |
whole_conversation_metadata = []
|
2303 |
|
2304 |
# Prepare Gemini models before query
|
2305 |
-
if
|
2306 |
print("Using Gemini model:", model_choice)
|
2307 |
model, config = construct_gemini_generative_model(in_api_key=in_api_key, temperature=temperature, model_choice=model_choice, system_prompt=system_prompt, max_tokens=max_tokens)
|
2308 |
else:
|
@@ -2464,7 +2366,7 @@ def summarise_output_topics(summarised_references:pd.DataFrame,
|
|
2464 |
if (model_choice == "gemma_2b_it_local") & (RUN_LOCAL_MODEL == "1"):
|
2465 |
progress(0.1, "Loading in Gemma 2b model")
|
2466 |
local_model, tokenizer = load_model()
|
2467 |
-
print("Local model loaded:", local_model)
|
2468 |
|
2469 |
summary_loop_description = "Creating summaries. " + str(latest_summary_completed) + " summaries completed so far."
|
2470 |
summary_loop = tqdm(range(latest_summary_completed, length_all_summaries), desc="Creating summaries", unit="summaries")
|
|
|
19 |
|
20 |
GradioFileData = gr.FileData
|
21 |
|
22 |
+
from tools.prompts import initial_table_prompt, prompt2, prompt3, system_prompt, summarise_topic_descriptions_prompt, summarise_topic_descriptions_system_prompt, add_existing_topics_system_prompt, add_existing_topics_prompt, create_general_topics_system_prompt, create_general_topics_prompt, force_existing_topics_prompt, allow_new_topics_prompt, force_single_topic_prompt
|
23 |
+
from tools.helper_functions import output_folder, read_file, get_or_create_env_var, model_name_map, put_columns_in_df, wrap_text, initial_clean, load_in_data_file, load_in_file
|
24 |
from tools.chatfuncs import LlamaCPPGenerationConfig, call_llama_cpp_model, load_model, RUN_LOCAL_MODEL
|
25 |
|
26 |
# ResponseObject class for AWS Bedrock calls
|
|
|
59 |
|
60 |
return text
|
61 |
|
|
|
|
|
|
|
|
|
|
|
62 |
def load_in_previous_data_files(file_paths_partial_output:List[str], for_modified_table:bool=False):
|
63 |
'''Load in data table from a partially completed consultation summary to continue it.'''
|
64 |
|
|
|
130 |
|
131 |
return gr.Dataframe(value=unique_file_data, headers=None, col_count=(unique_file_data.shape[1], "fixed"), row_count = (unique_file_data.shape[0], "fixed"), visible=True, type="pandas"), reference_file_data, unique_file_data, reference_file_name, unique_file_name, out_file_names
|
132 |
|
133 |
+
|
134 |
def get_basic_response_data(file_data:pd.DataFrame, chosen_cols:List[str], verify_titles:bool=False) -> pd.DataFrame:
|
135 |
|
136 |
if not isinstance(chosen_cols, list):
|
|
|
144 |
if verify_titles == True:
|
145 |
basic_response_data = basic_response_data.rename(columns={chosen_cols[0]: "Response", chosen_cols[1]: "Title"})
|
146 |
basic_response_data["Title"] = basic_response_data["Title"].str.strip()
|
147 |
+
basic_response_data["Title"] = basic_response_data["Title"].apply(initial_clean)
|
148 |
else:
|
149 |
basic_response_data = basic_response_data.rename(columns={chosen_cols[0]: "Response"})
|
150 |
|
151 |
basic_response_data["Response"] = basic_response_data["Response"].str.strip()
|
152 |
+
basic_response_data["Response"] = basic_response_data["Response"].apply(initial_clean)
|
153 |
|
154 |
return basic_response_data
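A minimal usage sketch of get_basic_response_data as defined above; the import path and the example column name are assumptions for illustration, and the "Reference" column is inferred from how the result is used further down in this file:

    import pandas as pd
    from tools.llm_api_call import get_basic_response_data

    file_data = pd.DataFrame({"Comments": ["  Too noisy at night \n", "More green space please"]})
    # The chosen column is renamed to "Response", then stripped and passed through initial_clean
    basic_response_data = get_basic_response_data(file_data, ["Comments"])
    print(basic_response_data["Response"].tolist())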
|
155 |
|
|
|
192 |
else:
|
193 |
end_row = file_len + 1
|
194 |
|
195 |
+
#print("start_row:", start_row)
|
196 |
+
#print("end_row:", end_row)
|
197 |
|
198 |
batch_basic_response_data = basic_response_data[start_row:end_row] # Select the current batch
|
199 |
|
200 |
+
#print("batch_basic_response_data:", batch_basic_response_data)
|
201 |
|
202 |
# Now replace the reference numbers with numbers starting from 1
|
203 |
batch_basic_response_data.loc[:, "Reference"] = batch_basic_response_data["Reference"] - start_row
|
|
|
345 |
|
346 |
# Now you can access both the text and metadata
|
347 |
#print("Text:", response.text)
|
348 |
+
#print("Metadata:", response.usage_metadata)
|
349 |
#print("Text:", response.text)
|
350 |
|
351 |
return response
|
|
|
375 |
progress_bar = range(0,number_of_api_retry_attempts)
|
376 |
|
377 |
# Generate the model's response
|
378 |
+
if "gemini" in model_choice:
|
379 |
|
380 |
for i in progress_bar:
|
381 |
try:
|
|
|
398 |
|
399 |
if i == number_of_api_retry_attempts:
|
400 |
return ResponseObject(text="", usage_metadata={'RequestId':"FAILED"}), conversation_history
|
401 |
+
elif "anthropic.claude" in model_choice:
|
402 |
for i in progress_bar:
|
403 |
try:
|
404 |
print("Calling AWS Claude model, attempt", i + 1)
|
|
|
608 |
|
609 |
return result
|
610 |
|
|
|
|
|
|
|
|
|
|
611 |
def clean_column_name(column_name, max_length=20):
|
612 |
# Convert to string
|
613 |
column_name = str(column_name)
|
|
|
634 |
.assign(Topic_number=lambda df: np.arange(1, len(df) + 1)) # Add numbering 1 to x
|
635 |
)
|
636 |
|
|
|
|
|
|
|
|
|
637 |
return out_unique_topics_df
|
638 |
|
639 |
# Convert output table to markdown and then to a pandas dataframe to csv
|
|
|
791 |
call_temperature, reported_batch_no, local_model, master=master
|
792 |
)
|
793 |
|
|
|
|
|
794 |
if (model_choice != "gemma_local") & (model_choice != "gemma_2b_it_local"):
|
795 |
stripped_response = responses[-1].text.strip()
|
796 |
else:
|
|
|
897 |
return topic_table_out_path, reference_table_out_path, unique_topics_df_out_path, topic_with_response_df, markdown_table, out_reference_df, out_unique_topics_df, batch_file_path_details, is_error
|
898 |
|
899 |
# Rename columns to ensure consistent use of data frames later in code
|
900 |
+
new_column_names = {
|
901 |
+
topic_with_response_df.columns[0]: "General Topic",
|
902 |
+
topic_with_response_df.columns[1]: "Subtopic",
|
903 |
+
topic_with_response_df.columns[2]: "Sentiment",
|
904 |
+
topic_with_response_df.columns[3]: "Response References",
|
905 |
+
topic_with_response_df.columns[4]: "Summary"
|
906 |
+
}
|
907 |
+
|
908 |
+
topic_with_response_df = topic_with_response_df.rename(columns=new_column_names)
|
909 |
+
|
910 |
|
911 |
# Fill in NA rows with values from above (topics seem to be included only on one row):
|
912 |
topic_with_response_df = topic_with_response_df.ffill()
|
|
|
938 |
sentiment = row.iloc[2] if pd.notna(row.iloc[2]) else ""
|
939 |
summary = row.iloc[4] if pd.notna(row.iloc[4]) else ""
|
940 |
# If the reference response column is very long, and there's nothing in the summary column, assume that the summary was put in the reference column
|
941 |
+
if not summary and len(str(row.iloc[3])) > 30:
|
942 |
+
summary = row.iloc[3]
|
943 |
|
944 |
summary = row_number_string_start + summary
|
945 |
|
|
|
1016 |
|
1017 |
return topic_table_out_path, reference_table_out_path, unique_topics_df_out_path, topic_with_response_df, markdown_table, out_reference_df, out_unique_topics_df, batch_file_path_details, is_error
|
1018 |
|
1019 |
+
def generate_zero_shot_topics_df(zero_shot_topics:pd.DataFrame,
|
1020 |
+
force_zero_shot_radio:str="No",
|
1021 |
+
create_revised_general_topics:bool=False,
|
1022 |
+
max_topic_no:int=120):
|
1023 |
+
|
1024 |
+
# Max 120 topics allowed
|
1025 |
+
if zero_shot_topics.shape[0] > max_topic_no:
|
1026 |
+
print("Maximum", max_topic_no, "topics allowed to fit within large language model context limits.")
|
1027 |
+
zero_shot_topics = zero_shot_topics.iloc[:max_topic_no, :]
|
1028 |
+
|
1029 |
+
# Forward slashes in the topic names seems to confuse the model
|
1030 |
+
if zero_shot_topics.shape[1] >= 1: # Check if there is at least one column
|
1031 |
+
for x in zero_shot_topics.columns:
|
1032 |
+
if not zero_shot_topics[x].isnull().all():
|
1033 |
+
zero_shot_topics[x] = zero_shot_topics[x].apply(initial_clean)
|
1034 |
+
|
1035 |
+
zero_shot_topics.loc[:, x] = (
|
1036 |
+
zero_shot_topics.loc[:, x]
|
1037 |
+
.str.strip()
|
1038 |
+
.str.replace('\n', ' ')
|
1039 |
+
.str.replace('\r', ' ')
|
1040 |
+
.str.replace('/', ' or ')
|
1041 |
+
.str.lower()
|
1042 |
+
.str.capitalize())
|
1043 |
+
|
1044 |
+
#print("zero_shot_topics:", zero_shot_topics)
|
1045 |
+
|
1046 |
+
# If number of columns is 1, keep only subtopics
|
1047 |
+
if zero_shot_topics.shape[1] == 1 and "General topic" not in zero_shot_topics.columns:
|
1048 |
+
zero_shot_topics_gen_topics_list = [""] * zero_shot_topics.shape[0]
|
1049 |
+
zero_shot_topics_subtopics_list = list(zero_shot_topics.iloc[:, 0])
|
1050 |
+
# Allow for possibility that the user only wants to set general topics and not subtopics
|
1051 |
+
elif zero_shot_topics.shape[1] == 1 and "General topic" in zero_shot_topics.columns:
|
1052 |
+
zero_shot_topics_gen_topics_list = list(zero_shot_topics["General Topic"])
|
1053 |
+
zero_shot_topics_subtopics_list = [""] * zero_shot_topics.shape[0]
|
1054 |
+
# If general topic and subtopic are specified
|
1055 |
+
elif set(["General topic", "Subtopic"]).issubset(zero_shot_topics.columns):
|
1056 |
+
print("Found General topic and Subtopic in zero shot topics")
|
1057 |
+
zero_shot_topics_gen_topics_list = list(zero_shot_topics["General topic"])
|
1058 |
+
zero_shot_topics_subtopics_list = list(zero_shot_topics["Subtopic"])
|
1059 |
+
|
1060 |
+
# If number of columns is at least 2, keep general topics and subtopics
|
1061 |
+
elif zero_shot_topics.shape[1] >= 2 and "Description" not in zero_shot_topics.columns:
|
1062 |
+
zero_shot_topics_gen_topics_list = list(zero_shot_topics.iloc[:, 0])
|
1063 |
+
zero_shot_topics_subtopics_list = list(zero_shot_topics.iloc[:, 1])
|
1064 |
+
else:
|
1065 |
+
# If there are more columns, just assume that the first column was meant to be a subtopic
|
1066 |
+
zero_shot_topics_gen_topics_list = [""] * zero_shot_topics.shape[0]
|
1067 |
+
zero_shot_topics_subtopics_list = list(zero_shot_topics.iloc[:, 0])
|
1068 |
+
|
1069 |
+
# Add a description if column is present
|
1070 |
+
# print("zero_shot_topics.shape[1]:", zero_shot_topics.shape[1])
|
1071 |
+
if "Description" in zero_shot_topics.columns:
|
1072 |
+
zero_shot_topics_description_list = list(zero_shot_topics["Description"])
|
1073 |
+
#print("Description found in topic title. List is:", zero_shot_topics_description_list)
|
1074 |
+
elif zero_shot_topics.shape[1] >= 3:
|
1075 |
+
zero_shot_topics_description_list = list(zero_shot_topics.iloc[:, 2]) # Assume the third column is description
|
1076 |
+
else:
|
1077 |
+
zero_shot_topics_description_list = [""] * zero_shot_topics.shape[0]
|
1078 |
+
|
1079 |
+
# If the responses are being forced into zero shot topics, allow an option for nothing relevant
|
1080 |
+
if force_zero_shot_radio == "Yes":
|
1081 |
+
zero_shot_topics_gen_topics_list.append("")
|
1082 |
+
zero_shot_topics_subtopics_list.append("No relevant topic")
|
1083 |
+
zero_shot_topics_description_list.append("")
|
1084 |
+
|
1085 |
+
if create_revised_general_topics == True:
|
1086 |
+
pass
|
1087 |
+
|
1088 |
+
# The following currently doesn't really work. Excluded for now.
|
1089 |
+
|
1090 |
+
# unique_topics_df = pd.DataFrame(data={
|
1091 |
+
# "General Topic":zero_shot_topics_gen_topics_list,
|
1092 |
+
# "Subtopic":zero_shot_topics_subtopics_list,
|
1093 |
+
# "Description": zero_shot_topics_description_list
|
1094 |
+
# })
|
1095 |
+
# unique_topics_markdown = unique_topics_df.to_markdown()
|
1096 |
+
|
1097 |
+
# #print("unique_topics_markdown:", unique_topics_markdown)
|
1098 |
+
|
1099 |
+
# formatted_general_topics_system_prompt = create_general_topics_system_prompt.format(consultation_context=context_textbox, column_name=chosen_cols)
|
1100 |
+
|
1101 |
+
# # Format the general_topics prompt with the topics
|
1102 |
+
# formatted_general_topics_prompt = create_general_topics_prompt.format(topics=unique_topics_markdown)
|
1103 |
+
|
1104 |
+
# if "gemma" in model_choice:
|
1105 |
+
# formatted_general_topics_prompt = llama_cpp_prefix + formatted_general_topics_system_prompt + "\n" + formatted_general_topics_prompt + llama_cpp_suffix
|
1106 |
+
|
1107 |
+
# formatted_general_topics_prompt_list = [formatted_general_topics_prompt]
|
1108 |
+
|
1109 |
+
# whole_conversation = []
|
1110 |
+
|
1111 |
+
# general_topic_response, general_topic_conversation_history, general_topic_conversation, general_topic_conversation_metadata, response_text = call_llm_with_markdown_table_checks(batch_prompts, system_prompt, conversation_history, whole_conversation, whole_conversation_metadata, model, config, model_choice, temperature, reported_batch_no, local_model, MAX_OUTPUT_VALIDATION_ATTEMPTS, master = True)
|
1112 |
+
|
1113 |
+
# # Convert response text to a markdown table
|
1114 |
+
# try:
|
1115 |
+
# zero_shot_topics_df, is_error = convert_response_text_to_markdown_table(response_text, table_type = "Revised topics table")
|
1116 |
+
# print("Output revised zero shot topics table is:", zero_shot_topics_df)
|
1117 |
+
|
1118 |
+
# zero_shot_revised_path = output_folder + "zero_shot_topics_with_general_topics.csv"
|
1119 |
+
# out_file_paths.append(zero_shot_revised_path)
|
1120 |
+
|
1121 |
+
# except Exception as e:
|
1122 |
+
# print("Error in parsing markdown table from response text:", e, "Not adding revised General Topics to table")
|
1123 |
+
|
1124 |
+
# if zero_shot_topics_df.empty:
|
1125 |
+
# print("Creation of revised general topics df failed, reverting to original list")
|
1126 |
+
else:
|
1127 |
+
pass
|
1128 |
+
|
1129 |
+
# Add description or not
|
1130 |
+
zero_shot_topics_df = pd.DataFrame(data={
|
1131 |
+
"General Topic":zero_shot_topics_gen_topics_list,
|
1132 |
+
"Subtopic":zero_shot_topics_subtopics_list,
|
1133 |
+
"Description": zero_shot_topics_description_list
|
1134 |
+
})
|
1135 |
+
|
1136 |
+
#if not zero_shot_topics_df["Description"].isnull().all():
|
1137 |
+
# zero_shot_topics_df["Description"] = zero_shot_topics_df["Description"].apply(initial_clean)
|
1138 |
+
|
1139 |
+
return zero_shot_topics_df
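A rough usage sketch of the new generate_zero_shot_topics_df helper; the import path and the topic rows below are invented for illustration:

    import pandas as pd
    from tools.llm_api_call import generate_zero_shot_topics_df

    candidate_topics = pd.DataFrame({
        "General topic": ["Transport"],
        "Subtopic": ["Road safety"],
        "Description": ["Speeding and poor crossings near schools"],
    })

    # Cleans the supplied topics and, with force_zero_shot_radio set to "Yes",
    # appends a "No relevant topic" row so the model has an opt-out
    topics_df = generate_zero_shot_topics_df(candidate_topics, force_zero_shot_radio="Yes")
    print(topics_df[["General Topic", "Subtopic"]])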
|
1140 |
+
|
1141 |
@spaces.GPU
|
1142 |
def extract_topics(in_data_file,
|
1143 |
file_data:pd.DataFrame,
|
|
|
1171 |
sentiment_checkbox:str = "Negative, Neutral, or Positive",
|
1172 |
force_zero_shot_radio:str = "No",
|
1173 |
in_excel_sheets:List[str] = [],
|
1174 |
+
force_single_topic_radio:str = "No",
|
1175 |
+
force_single_topic_prompt:str=force_single_topic_prompt,
|
1176 |
max_tokens:int=max_tokens,
|
1177 |
model_name_map:dict=model_name_map,
|
1178 |
max_time_for_loop:int=max_time_for_loop,
|
|
|
1213 |
- time_taken (float, optional): The amount of time taken to process the responses up until this point.
|
1214 |
- sentiment_checkbox (str, optional): What type of sentiment analysis should the topic modeller do?
|
1215 |
- force_zero_shot_radio (str, optional): Should responses be forced into a zero shot topic or not.
|
1216 |
+
- in_excel_sheets (List[str], optional): List of excel sheets to load from input file.
|
1217 |
+
- force_single_topic_radio (str, optional): Should the model be forced to assign only one single topic to each response (effectively a classifier).
|
1218 |
+
- force_single_topic_prompt (str, optional): The prompt for forcing the model to assign only one single topic to each response.
|
1219 |
- max_tokens (int): The maximum number of tokens for the model.
|
1220 |
- model_name_map (dict, optional): A dictionary mapping full model name to shortened.
|
1221 |
- max_time_for_loop (int, optional): The number of seconds maximum that the function should run for before breaking (to run again, this is to avoid timeouts with some AWS services if deployed there).
|
|
|
1245 |
if file_data.empty:
|
1246 |
print("No data table found, loading from file")
|
1247 |
try:
|
|
|
1248 |
in_colnames_drop, in_excel_sheets, file_name = put_columns_in_df(in_data_file)
|
|
|
1249 |
file_data, file_name, num_batches = load_in_data_file(in_data_file, chosen_cols, batch_size_default, in_excel_sheets)
|
|
|
1250 |
except:
|
1251 |
# Check if files and text exist
|
1252 |
out_message = "Please enter a data file to summarise."
|
1253 |
print(out_message)
|
1254 |
raise Exception(out_message)
|
|
|
1255 |
|
1256 |
|
1257 |
#model_choice_clean = replace_punctuation_with_underscore(model_choice)
|
|
|
1264 |
latest_batch_completed = 0
|
1265 |
out_message = []
|
1266 |
out_file_paths = []
|
|
|
1267 |
|
1268 |
if (model_choice == "gemma_2b_it_local") & (RUN_LOCAL_MODEL == "1"):
|
1269 |
progress(0.1, "Loading in Gemma 2b model")
|
1270 |
local_model, tokenizer = load_model()
|
|
|
1271 |
|
1272 |
if num_batches > 0:
|
1273 |
progress_measure = round(latest_batch_completed / num_batches, 1)
|
|
|
1286 |
out_file_paths = []
|
1287 |
|
1288 |
|
1289 |
+
if "anthropic.claude-3-sonnet" in model_choice and file_data.shape[1] > 300:
|
1290 |
out_message = "Your data has more than 300 rows, using the Sonnet model will be too expensive. Please choose the Haiku model instead."
|
1291 |
print(out_message)
|
1292 |
+
raise Exception(out_message)
|
|
|
|
|
1293 |
|
1294 |
if sentiment_checkbox == "Negative, Neutral, or Positive": sentiment_prompt = "In the third column, write the sentiment of the Subtopic: Negative, Neutral, or Positive"
|
1295 |
elif sentiment_checkbox == "Negative or Positive": sentiment_prompt = "In the third column, write the sentiment of the Subtopic: Negative or Positive"
|
|
|
1320 |
if latest_batch_completed >= 1 or candidate_topics is not None:
|
1321 |
|
1322 |
# Prepare Gemini models before query
|
1323 |
+
if "gemini" in model_choice:
|
1324 |
print("Using Gemini model:", model_choice)
|
1325 |
model, config = construct_gemini_generative_model(in_api_key=in_api_key, temperature=temperature, model_choice=model_choice, system_prompt=add_existing_topics_system_prompt, max_tokens=max_tokens)
|
1326 |
+
elif "anthropic.claude" in model_choice:
|
1327 |
print("Using AWS Bedrock model:", model_choice)
|
1328 |
else:
|
1329 |
print("Using local model:", model_choice)
|
|
|
1334 |
|
1335 |
# 'Zero shot topics' are those supplied by the user
|
1336 |
max_topic_no = 120
|
1337 |
+
zero_shot_topics = read_file(candidate_topics.name)
|
1338 |
+
|
1339 |
+
zero_shot_topics_df = generate_zero_shot_topics_df(zero_shot_topics, force_zero_shot_radio, create_revised_general_topics, max_topic_no)
|
1340 |
+
|
1341 |
+
#print("zero_shot_topics_df:", zero_shot_topics_df)
|
1342 |
+
|
1343 |
+
# This part concatenates all zero shot and new topics together, so that for the next prompt the LLM will have the full list available
|
1344 |
+
if not existing_unique_topics_df.empty and force_zero_shot_radio != "Yes":
|
1345 |
+
existing_unique_topics_df = pd.concat([existing_unique_topics_df, zero_shot_topics_df]).drop_duplicates("Subtopic")
|
1346 |
+
else:
|
1347 |
+
existing_unique_topics_df = zero_shot_topics_df
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1348 |
|
1349 |
if candidate_topics and not zero_shot_topics_df.empty:
|
1350 |
# If you have already created revised zero shot topics, concat to the current
|
|
|
1355 |
existing_unique_topics_df.fillna("", inplace=True)
|
1356 |
existing_unique_topics_df["General Topic"] = existing_unique_topics_df["General Topic"].str.replace('(?i)^Nan$', '', regex=True)
|
1357 |
existing_unique_topics_df["Subtopic"] = existing_unique_topics_df["Subtopic"].str.replace('(?i)^Nan$', '', regex=True)
|
1358 |
+
existing_unique_topics_df = existing_unique_topics_df.drop_duplicates()
|
1359 |
+
if "Description" in existing_unique_topics_df:
|
1360 |
+
if existing_unique_topics_df['Description'].isnull().all():
|
1361 |
+
existing_unique_topics_df.drop("Description", axis = 1, inplace = True)
|
1362 |
|
1363 |
# print("existing_unique_topics_df:", existing_unique_topics_df)
|
1364 |
|
1365 |
# If user has chosen to try to force zero shot topics, then the prompt is changed to ask the model not to deviate at all from submitted topic list.
|
1366 |
+
keep_cols = [
|
1367 |
+
col for col in ["General Topic", "Subtopic", "Description"]
|
1368 |
+
if col in existing_unique_topics_df.columns
|
1369 |
+
and not existing_unique_topics_df[col].replace(r'^\s*$', pd.NA, regex=True).isna().all()
|
1370 |
+
]
|
1371 |
+
|
1372 |
+
if force_zero_shot_radio == "Yes":
|
1373 |
+
topics_df_for_markdown = existing_unique_topics_df[keep_cols].drop_duplicates(keep_cols)
|
1374 |
+
unique_topics_markdown = topics_df_for_markdown.to_markdown(index=False)
|
1375 |
topic_assignment_prompt = force_existing_topics_prompt
|
1376 |
else:
|
1377 |
+
topics_df_for_markdown = existing_unique_topics_df[keep_cols].drop_duplicates(keep_cols)
|
1378 |
+
unique_topics_markdown = topics_df_for_markdown.to_markdown(index=False)
|
1379 |
+
topic_assignment_prompt = allow_new_topics_prompt
|
1380 |
+
|
1381 |
+
# Should the outputs force only one single topic assignment per response?
|
1382 |
+
if force_single_topic_radio != "Yes": force_single_topic_prompt = ""
|
1383 |
+
else:
|
1384 |
+
topic_assignment_prompt = topic_assignment_prompt.replace("Assign topics", "Assign a topic").replace("assign Subtopics", "assign a Subtopic").replace("Subtopics", "Subtopic").replace("Topics", "Topic").replace("topics", "a topic")
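For illustration only, the chain of str.replace calls above turns the plural topic-assignment wording into singular form; the input string here is a stand-in rather than the real prompt text:

    text = "Assign topics to responses and assign Subtopics where relevant."
    text = (text.replace("Assign topics", "Assign a topic")
                .replace("assign Subtopics", "assign a Subtopic")
                .replace("Subtopics", "Subtopic")
                .replace("Topics", "Topic")
                .replace("topics", "a topic"))
    # text is now "Assign a topic to responses and assign a Subtopic where relevant."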
|
1385 |
|
1386 |
# Format the summary prompt with the response table and topics
|
1387 |
formatted_system_prompt = add_existing_topics_system_prompt.format(consultation_context=context_textbox, column_name=chosen_cols)
|
1388 |
+
formatted_summary_prompt = add_existing_topics_prompt.format(response_table=normalised_simple_markdown_table, topics=unique_topics_markdown, topic_assignment=topic_assignment_prompt, force_single_topic=force_single_topic_prompt, sentiment_choices=sentiment_prompt)
|
1389 |
|
1390 |
|
1391 |
+
if "gemma" in model_choice:
|
1392 |
formatted_summary_prompt = llama_cpp_prefix + formatted_system_prompt + "\n" + formatted_summary_prompt + llama_cpp_suffix
|
1393 |
full_prompt = formatted_summary_prompt
|
1394 |
else:
|
|
|
1406 |
except Exception as e:
|
1407 |
print(f"Error writing prompt to file {formatted_prompt_output_path}: {e}")
|
1408 |
|
1409 |
+
if "gemma" in model_choice:
|
1410 |
summary_prompt_list = [full_prompt] # Includes system prompt
|
1411 |
else:
|
1412 |
summary_prompt_list = [formatted_summary_prompt]
|
|
|
1417 |
whole_conversation = []
|
1418 |
|
1419 |
# Process requests to large language model
|
|
|
|
|
1420 |
responses, conversation_history, whole_conversation, whole_conversation_metadata, response_text = call_llm_with_markdown_table_checks(summary_prompt_list, system_prompt, conversation_history, whole_conversation, whole_conversation_metadata, model, config, model_choice, temperature, reported_batch_no, local_model, MAX_OUTPUT_VALIDATION_ATTEMPTS, master = True)
|
1421 |
|
1422 |
+
# Return output tables
|
|
|
|
|
1423 |
topic_table_out_path, reference_table_out_path, unique_topics_df_out_path, new_topic_df, new_markdown_table, new_reference_df, new_unique_topics_df, master_batch_out_file_part, is_error = write_llm_output_and_logs(responses, whole_conversation, whole_conversation_metadata, file_name, latest_batch_completed, start_row, end_row, model_choice_clean, temperature, log_files_output_paths, existing_reference_df, existing_unique_topics_df, batch_size, chosen_cols, first_run=False)
|
1424 |
|
1425 |
# Write final output to text file for logging purposes
|
|
|
1444 |
if is_error == True:
|
1445 |
final_message_out = "Could not complete summary, error in LLM output."
|
1446 |
raise Exception(final_message_out)
|
|
|
1447 |
|
1448 |
# Write outputs to csv
|
1449 |
## Topics with references
|
|
|
1462 |
|
1463 |
# Outputs for markdown table output
|
1464 |
unique_table_df_display_table = new_unique_topics_df.apply(lambda col: col.map(lambda x: wrap_text(x, max_text_length=500)))
|
1465 |
+
unique_table_df_display_table_markdown = unique_table_df_display_table[["General Topic", "Subtopic", "Sentiment", "Response References", "Summary"]].to_markdown(index=False)
|
1466 |
|
1467 |
#whole_conversation_metadata.append(whole_conversation_metadata_str)
|
1468 |
whole_conversation_metadata_str = ' '.join(whole_conversation_metadata)
|
|
|
1481 |
#system_prompt = system_prompt + normalised_simple_markdown_table
|
1482 |
|
1483 |
# Prepare Gemini models before query
|
1484 |
+
if "gemini" in model_choice:
|
1485 |
print("Using Gemini model:", model_choice)
|
1486 |
model, config = construct_gemini_generative_model(in_api_key=in_api_key, temperature=temperature, model_choice=model_choice, system_prompt=system_prompt, max_tokens=max_tokens)
|
1487 |
+
elif "gemma" in model_choice:
|
1488 |
+
print("Using local Gemma model:", model_choice)
|
1489 |
else:
|
1490 |
print("Using AWS Bedrock model:", model_choice)
|
1491 |
|
|
|
1499 |
if prompt3: formatted_prompt3 = prompt3.format(response_table=normalised_simple_markdown_table, sentiment_choices=sentiment_prompt)
|
1500 |
else: formatted_prompt3 = prompt3
|
1501 |
|
1502 |
+
if "gemma" in model_choice:
|
1503 |
formatted_initial_table_prompt = llama_cpp_prefix + formatted_initial_table_system_prompt + "\n" + formatted_initial_table_prompt + llama_cpp_suffix
|
1504 |
formatted_prompt2 = llama_cpp_prefix + formatted_initial_table_system_prompt + "\n" + formatted_prompt2 + llama_cpp_suffix
|
1505 |
formatted_prompt3 = llama_cpp_prefix + formatted_initial_table_system_prompt + "\n" + formatted_prompt3 + llama_cpp_suffix
|
|
|
1605 |
# Set to a very high number so as not to mess with subsequent file processing by the user
|
1606 |
#latest_batch_completed = 999
|
1607 |
|
1608 |
+
join_file_paths = []
|
1609 |
+
|
1610 |
toc = time.perf_counter()
|
1611 |
final_time = (toc - tic) + time_taken
|
1612 |
out_time = f"Everything finished in {round(final_time,1)} seconds."
|
|
|
1637 |
## Reference table mapping response numbers to topics
|
1638 |
existing_reference_df.to_csv(reference_table_out_path, index=None)
|
1639 |
out_file_paths.append(reference_table_out_path)
|
1640 |
+
join_file_paths.append(reference_table_out_path)
|
1641 |
|
1642 |
# Create final unique topics table from reference table to ensure consistent numbers
|
1643 |
final_out_unique_topics_df = create_unique_table_df_from_reference_table(existing_reference_df)
|
|
|
1692 |
|
1693 |
print("latest_batch_completed at end of batch iterations to return is", latest_batch_completed)
|
1694 |
|
1695 |
+
return unique_table_df_display_table_markdown, existing_topics_table, final_out_unique_topics_df, existing_reference_df, final_out_file_paths, final_out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, final_out_file_paths, final_out_file_paths, gr.Dataframe(value=modifiable_unique_topics_df, headers=None, col_count=(modifiable_unique_topics_df.shape[1], "fixed"), row_count = (modifiable_unique_topics_df.shape[0], "fixed"), visible=True, type="pandas"), final_out_file_paths, join_file_paths
|
|
|
|
|
|
|
|
|
1696 |
|
1697 |
|
1698 |
+
return unique_table_df_display_table_markdown, existing_topics_table, existing_unique_topics_df, existing_reference_df, out_file_paths, out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, out_file_paths, out_file_paths, gr.Dataframe(value=modifiable_unique_topics_df, headers=None, col_count=(modifiable_unique_topics_df.shape[1], "fixed"), row_count = (modifiable_unique_topics_df.shape[0], "fixed"), visible=True, type="pandas"), out_file_paths, join_file_paths
|
1699 |
|
1700 |
def convert_reference_table_to_pivot_table(df:pd.DataFrame, basic_response_data:pd.DataFrame=pd.DataFrame()):
|
1701 |
|
|
|
2204 |
whole_conversation_metadata = []
|
2205 |
|
2206 |
# Prepare Gemini models before query
|
2207 |
+
if "gemini" in model_choice:
|
2208 |
print("Using Gemini model:", model_choice)
|
2209 |
model, config = construct_gemini_generative_model(in_api_key=in_api_key, temperature=temperature, model_choice=model_choice, system_prompt=system_prompt, max_tokens=max_tokens)
|
2210 |
else:
|
|
|
2366 |
if (model_choice == "gemma_2b_it_local") & (RUN_LOCAL_MODEL == "1"):
|
2367 |
progress(0.1, "Loading in Gemma 2b model")
|
2368 |
local_model, tokenizer = load_model()
|
2369 |
+
#print("Local model loaded:", local_model)
|
2370 |
|
2371 |
summary_loop_description = "Creating summaries. " + str(latest_summary_completed) + " summaries completed so far."
|
2372 |
summary_loop = tqdm(range(latest_summary_completed, length_all_summaries), desc="Creating summaries", unit="summaries")
|
tools/prompts.py
CHANGED
@@ -29,14 +29,16 @@ In the first column, write 'Not assessed'. In the second column, assign Subtopic
|
|
29 |
allow_new_topics_prompt = """Create a new markdown table with the headings 'General Topic', 'Subtopic', 'Sentiment', 'Response References', and 'Summary'.
|
30 |
In the first and second columns, assign General Topics and Subtopics to Responses. Assign topics from the Topics table above only if they are very relevant to the text of the Response. Fill in the General Topic and Sentiment for the Subtopic if they do not already exist. If you find a new topic that does not exist in the Topics table, add a new row to the new table. Make the General Topic and Subtopic as specific as possible. The subtopic should never be blank or empty."""
|
31 |
|
|
|
|
|
32 |
add_existing_topics_prompt = """Responses are shown in the following Response table:
|
33 |
{response_table}
|
34 |
|
35 |
Topics known to be relevant to this dataset are shown in the following Topics table:
|
36 |
{topics}
|
37 |
|
38 |
-
Your task is to create one new markdown table, assigning responses from the Response table to
|
39 |
-
{topic_assignment}
|
40 |
{sentiment_choices}.
|
41 |
In the fourth column list each specific Response reference number that is relevant to the Subtopic, separated by commas. Do not write any other text in this column.
|
42 |
In the fifth column, write a short summary of the Subtopic based on relevant responses - highlight specific issues that appear.
|
@@ -46,7 +48,6 @@ New table:"""
|
|
46 |
|
47 |
# Return only one table in markdown format containing all relevant topics. Remove topics from the table that are not assigned to any response. Do not repeat Subtopics with the same Sentiment.
|
48 |
|
49 |
-
|
50 |
summarise_topic_descriptions_system_prompt = system_prompt
|
51 |
|
52 |
summarise_topic_descriptions_prompt = """Below is a table with number of paragraphs related to the data from the open text column:
|
@@ -57,6 +58,10 @@ Your task is to make a consolidated summary of the above text. {summary_format}.
|
|
57 |
|
58 |
Summary:"""
|
59 |
|
60 |
|
61 |
## The following didn't work well in testing and so is not currently used
|
62 |
|
@@ -74,16 +79,16 @@ New Topics table:"""
|
|
74 |
verify_titles_system_prompt = system_prompt
|
75 |
|
76 |
|
77 |
-
verify_titles_prompt = """Response numbers alongside the Response text and assigned titles are shown in the table below:
|
78 |
{response_table}
|
79 |
|
80 |
-
The criteria for a suitable title for these responses are that it should be readable, concise, and fully encapsulate the main subject of the response.
|
81 |
|
82 |
Create a markdown table with four columns.
|
83 |
The first column is 'Response References', and should contain just the response number under consideration.
|
84 |
-
The second column is 'Is this a suitable title', answer the question with 'Yes' or 'No', with no other text.
|
85 |
The third column is 'Explanation', give a short explanation for your response in the second column.
|
86 |
-
The fourth column is 'Alternative title', suggest an alternative title for the response that meets the criteria stated above.
|
87 |
Do not add any other text to your response.
|
88 |
|
89 |
Output markdown table:"""
|
|
|
29 |
allow_new_topics_prompt = """Create a new markdown table with the headings 'General Topic', 'Subtopic', 'Sentiment', 'Response References', and 'Summary'.
|
30 |
In the first and second columns, assign General Topics and Subtopics to Responses. Assign topics from the Topics table above only if they are very relevant to the text of the Response. Fill in the General Topic and Sentiment for the Subtopic if they do not already exist. If you find a new topic that does not exist in the Topics table, add a new row to the new table. Make the General Topic and Subtopic as specific as possible. The subtopic should never be blank or empty."""
|
31 |
|
32 |
+
force_single_topic_prompt = """ Wherever possible, assign each response to a single topic, unless multiple topics are equally relevant."""
|
33 |
+
|
34 |
add_existing_topics_prompt = """Responses are shown in the following Response table:
|
35 |
{response_table}
|
36 |
|
37 |
Topics known to be relevant to this dataset are shown in the following Topics table:
|
38 |
{topics}
|
39 |
|
40 |
+
Your task is to create one new markdown table, assigning responses from the Response table to topics.
|
41 |
+
{topic_assignment}{force_single_topic}
|
42 |
{sentiment_choices}.
|
43 |
In the fourth column, list each specific Response reference number that is relevant to the Subtopic, separated by commas. Do not write any other text in this column.
|
44 |
In the fifth column, write a short summary of the Subtopic based on relevant responses - highlight specific issues that appear.
|
|
|
48 |
|
49 |
# Return only one table in markdown format containing all relevant topics. Remove topics from the table that are not assigned to any response. Do not repeat Subtopics with the same Sentiment.
|
50 |
|
|
|
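A minimal sketch of how the placeholders in add_existing_topics_prompt might be filled at call time, assuming the five placeholders shown above are the template's only ones. Only the placeholder names come from the template; the example tables and instruction strings are invented for illustration.

```python
from tools.prompts import add_existing_topics_prompt, force_single_topic_prompt

# Hypothetical inputs; only the placeholder names are taken from the template.
formatted_prompt = add_existing_topics_prompt.format(
    response_table="| Reference | Response |\n|---|---|\n| 1 | The park needs more benches. |",
    topics="| General Topic | Subtopic |\n|---|---|\n| Facilities | Seating |",
    topic_assignment="Assign Subtopics from the Topics table above only.",
    force_single_topic=force_single_topic_prompt,  # or "" to allow multiple topics per response
    sentiment_choices="In the third column, write the Sentiment of the Subtopic: Negative, Neutral or Positive",
)
```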
51 |
summarise_topic_descriptions_system_prompt = system_prompt
|
52 |
|
53 |
summarise_topic_descriptions_prompt = """Below is a table with number of paragraphs related to the data from the open text column:
|
|
|
58 |
|
59 |
Summary:"""
|
60 |
|
61 |
+
single_para_summary_format_prompt = "Return a concise summary up to one paragraph long that summarises only the most important themes from the original text"
|
62 |
+
|
63 |
+
two_para_summary_format_prompt = "Return a summary up to two paragraphs long that includes as much detail as possible from the original text"
|
64 |
+
|
65 |
|
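The two new format strings above appear to be alternative fillers for the {summary_format} placeholder in summarise_topic_descriptions_prompt. A minimal sketch of choosing between them; the helper name and boolean flag are illustrative only.

```python
from tools.prompts import (single_para_summary_format_prompt,
                           two_para_summary_format_prompt)

def choose_summary_format(detailed: bool) -> str:
    # Select the instruction that will fill the {summary_format} placeholder
    # in the summarise_topic_descriptions_prompt template.
    return two_para_summary_format_prompt if detailed else single_para_summary_format_prompt
```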
66 |
## The following didn't work well in testing and so is not currently used
|
67 |
|
|
|
79 |
verify_titles_system_prompt = system_prompt
|
80 |
|
81 |
|
82 |
+
verify_titles_prompt = """Response numbers alongside the Response text and assigned descriptions are shown in the table below:
|
83 |
{response_table}
|
84 |
|
85 |
+
The criteria for a suitable description for these responses are that it should be readable, concise, and fully encapsulate the main subject of the response.
|
86 |
|
87 |
Create a markdown table with four columns.
|
88 |
The first column is 'Response References', and should contain just the response number under consideration.
|
89 |
+
The second column is 'Is this a suitable description', answer the question with 'Yes' or 'No', with no other text.
|
90 |
The third column is 'Explanation', give a short explanation for your response in the second column.
|
91 |
+
The fourth column is 'Alternative description', suggest an alternative description for the response that meets the criteria stated above.
|
92 |
Do not add any other text to your response.
|
93 |
|
94 |
Output markdown table:"""
|
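A minimal sketch of filling verify_titles_prompt, assuming {response_table} is its only placeholder; the markdown table content is invented for illustration.

```python
from tools.prompts import verify_titles_prompt

# Hypothetical response table; only the {response_table} placeholder name
# is taken from the template above.
verify_prompt = verify_titles_prompt.format(
    response_table=("| Reference | Response | Description |\n"
                    "|---|---|---|\n"
                    "| 1 | The park needs more benches. | Request for more seating |")
)
```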