seanpedrickcase committed
Commit c61bb70 · Parent: 49faa78

Added GPT-OSS 20B support. Moved to the llama-cpp-python chat_completion function.
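The switch to the llama-cpp-python chat-completion API changes how local-model responses are parsed (see the tools/dedup_summaries.py and tools/llm_api_call.py hunks below). A minimal sketch of that call and parsing follows, assuming a plain llama_cpp.Llama instance; the model path and parameter values are illustrative defaults taken from tools/config.py, and the app's actual wrapper in tools/llm_funcs.py may differ.

# Minimal sketch (not the app's exact wrapper): load a GGUF model and call the
# chat-completion API that this commit moves to.
from llama_cpp import Llama

llm = Llama(
    model_path="model/gpt_oss/gpt-oss-20b-F16.gguf",  # GPT_OSS_MODEL_FOLDER / GPT_OSS_MODEL_FILE defaults
    n_gpu_layers=-1,   # LLM_MAX_GPU_LAYERS: offload as many layers as possible
    n_ctx=16384,       # LLM_CONTEXT_LENGTH
    seed=42,           # LLM_SEED
)

response = llm.create_chat_completion(
    messages=[
        # For local models the REASONING_SUFFIX ("Reasoning: low") is appended to the system prompt
        {"role": "system", "content": "You are an AI assistant that extracts topics.\nReasoning: low"},
        {"role": "user", "content": "Extract topics from the following response table..."},
    ],
    temperature=0.1,   # LLM_TEMPERATURE
    max_tokens=4096,   # LLM_MAX_NEW_TOKENS
)

# Chat completions return text under ['message']['content'] rather than ['text'],
# which is why the response parsing in tools/dedup_summaries.py and
# tools/llm_api_call.py changes in this commit.
response_text = response["choices"][0]["message"]["content"]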

app.py CHANGED
@@ -109,7 +109,7 @@ with app:
109
  master_unique_topics_df_revised_summaries_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="master_unique_topics_df_revised_summaries_state", visible=False, type="pandas")
110
  summarised_output_df = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="summarised_output_df", visible=False, type="pandas")
111
  summarised_references_markdown = gr.Markdown("", visible=False)
112
- summarised_outputs_list = gr.Dropdown(value=[], choices=[], visible=False, label="List of summarised outputs", allow_custom_value=True)
113
  latest_summary_completed_num = gr.Number(0, visible=False)
114
 
115
  original_data_file_name_textbox = gr.Textbox(label = "Reference data file name", value="", visible=False)
@@ -147,7 +147,7 @@ with app:
147
  gr.Markdown("""### Choose a tabular data file (xlsx or csv) of open text to extract topics from.""")
148
  with gr.Row():
149
  model_choice = gr.Dropdown(value = default_model_choice, choices = model_full_names, label="LLM model", multiselect=False)
150
- in_api_key = gr.Textbox(value = GEMINI_API_KEY, label="Enter Gemini API key (only if using Google API models)", lines=1, type="password")
151
 
152
  with gr.Accordion("Upload xlsx or csv file", open = True):
153
  in_data_files = gr.File(height=FILE_INPUT_HEIGHT, label="Choose Excel or csv files", file_count= "multiple", file_types=['.xlsx', '.xls', '.csv', '.parquet', '.csv.gz'])
@@ -308,6 +308,9 @@ with app:
308
  aws_access_key_textbox = gr.Textbox(label="AWS access key", interactive=False, lines=1, type="password")
309
  aws_secret_key_textbox = gr.Textbox(label="AWS secret key", interactive=False, lines=1, type="password")
310
 
 
 
 
311
  # Invisible text box to hold the session hash/username just for logging purposes
312
  session_hash_textbox = gr.Textbox(label = "Session hash", value="", visible=False)
313
 
@@ -315,19 +318,6 @@ with app:
315
  total_number_of_batches = gr.Number(label = "Current batch number", value = 1, precision=0, visible=False)
316
 
317
  text_output_logs = gr.Textbox(label = "Output summary logs", visible=False)
318
-
319
- # AWS options - not yet implemented
320
- # with gr.Tab(label="Advanced options"):
321
- # with gr.Accordion(label = "AWS data access", open = True):
322
- # aws_password_box = gr.Textbox(label="Password for AWS data access (ask the Data team if you don't have this)")
323
- # with gr.Row():
324
- # in_aws_file = gr.Dropdown(label="Choose file to load from AWS (only valid for API Gateway app)", choices=["None", "Lambeth borough plan"])
325
- # load_aws_data_button = gr.Button(value="Load data from AWS", variant="secondary")
326
-
327
- # aws_log_box = gr.Textbox(label="AWS data load status")
328
-
329
- # ### Loading AWS data ###
330
- # load_aws_data_button.click(fn=load_data_from_aws, inputs=[in_aws_file, aws_password_box], outputs=[in_file, aws_log_box])
331
 
332
  ###
333
  # INTERACTIVE ELEMENT FUNCTIONS
@@ -364,7 +354,7 @@ with app:
364
  display_topic_table_markdown,
365
  original_data_file_name_textbox,
366
  total_number_of_batches,
367
- in_api_key,
368
  temperature_slide,
369
  in_colnames,
370
  model_choice,
@@ -411,7 +401,7 @@ with app:
411
  output_tokens_num,
412
  number_of_calls_num],
413
  api_name="extract_topics")
414
-
415
  ###
416
  # DEDUPLICATION AND SUMMARISATION FUNCTIONS
417
  ###
@@ -430,14 +420,14 @@ with app:
430
  success(fn= enforce_cost_codes, inputs=[enforce_cost_code_textbox, cost_code_choice_drop, cost_code_dataframe_base]).\
431
  success(load_in_previous_data_files, inputs=[summarisation_input_files], outputs=[master_reference_df_state, master_unique_topics_df_state, latest_batch_completed_no_loop, deduplication_input_files_status, working_data_file_name_textbox, unique_topics_table_file_name_textbox]).\
432
  success(sample_reference_table_summaries, inputs=[master_reference_df_state, random_seed], outputs=[summary_reference_table_sample_state, summarised_references_markdown], api_name="sample_summaries").\
433
- success(summarise_output_topics, inputs=[summary_reference_table_sample_state, master_unique_topics_df_state, master_reference_df_state, model_choice, in_api_key, temperature_slide, working_data_file_name_textbox, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, in_data_files, in_excel_sheets, in_colnames, log_files_output_list_state, summarise_format_radio, output_folder_state, context_textbox, aws_access_key_textbox, aws_secret_key_textbox], outputs=[summary_reference_table_sample_state, master_unique_topics_df_revised_summaries_state, master_reference_df_revised_summaries_state, summary_output_files, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, summarised_output_markdown, log_files_output, overall_summarisation_input_files, input_tokens_num, output_tokens_num, number_of_calls_num, estimated_time_taken_number], api_name="summarise_topics")
434
 
435
- # latest_summary_completed_num.change(summarise_output_topics, inputs=[summary_reference_table_sample_state, master_unique_topics_df_state, master_reference_df_state, model_choice, in_api_key, temperature_slide, working_data_file_name_textbox, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, in_data_files, in_excel_sheets, in_colnames, log_files_output_list_state, summarise_format_radio, output_folder_state, context_textbox], outputs=[summary_reference_table_sample_state, master_unique_topics_df_revised_summaries_state, master_reference_df_revised_summaries_state, summary_output_files, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, summarised_output_markdown, log_files_output, overall_summarisation_input_files, input_tokens_num, output_tokens_num, number_of_calls_num], scroll_to_output=True)
436
 
437
  # SUMMARISE WHOLE TABLE PAGE
438
  overall_summarise_previous_data_btn.click(fn= enforce_cost_codes, inputs=[enforce_cost_code_textbox, cost_code_choice_drop, cost_code_dataframe_base]).\
439
  success(load_in_previous_data_files, inputs=[overall_summarisation_input_files], outputs=[master_reference_df_state, master_unique_topics_df_state, latest_batch_completed_no_loop, deduplication_input_files_status, working_data_file_name_textbox, unique_topics_table_file_name_textbox]).\
440
- success(overall_summary, inputs=[master_unique_topics_df_state, model_choice, in_api_key, temperature_slide, unique_topics_table_file_name_textbox, output_folder_state, in_colnames, context_textbox, aws_access_key_textbox, aws_secret_key_textbox], outputs=[overall_summary_output_files, overall_summarised_output_markdown, summarised_output_df, conversation_metadata_textbox, input_tokens_num, output_tokens_num, number_of_calls_num, estimated_time_taken_number], scroll_to_output=True, api_name="overall_summary")
441
 
442
  ###
443
  # CONTINUE PREVIOUS TOPIC EXTRACTION PAGE
@@ -512,7 +502,13 @@ with app:
512
  usage_callback.setup([session_hash_textbox, original_data_file_name_textbox, in_colnames, model_choice, conversation_metadata_textbox, input_tokens_num,
513
  output_tokens_num, number_of_calls_num, estimated_time_taken_number, cost_code_choice_drop], USAGE_LOGS_FOLDER)
514
 
515
- conversation_metadata_textbox.change(lambda *args: usage_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=USAGE_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_USAGE_LOG_HEADERS, replacement_headers=CSV_USAGE_LOG_HEADERS), [session_hash_textbox, original_data_file_name_textbox, in_colnames, model_choice, conversation_metadata_textbox, input_tokens_num, output_tokens_num, number_of_calls_num, estimated_time_taken_number, cost_code_choice_drop], None, preprocess=False, api_name="usage_logs").\
 
 
 
 
 
 
516
  success(fn = upload_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state, s3_log_bucket_name, aws_access_key_textbox, aws_secret_key_textbox], outputs=[s3_logs_output_textbox])
517
 
518
  # User submitted feedback
 
109
  master_unique_topics_df_revised_summaries_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="master_unique_topics_df_revised_summaries_state", visible=False, type="pandas")
110
  summarised_output_df = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="summarised_output_df", visible=False, type="pandas")
111
  summarised_references_markdown = gr.Markdown("", visible=False)
112
+ summarised_outputs_list = gr.Dropdown(value= list(), choices= list(), visible=False, label="List of summarised outputs", allow_custom_value=True)
113
  latest_summary_completed_num = gr.Number(0, visible=False)
114
 
115
  original_data_file_name_textbox = gr.Textbox(label = "Reference data file name", value="", visible=False)
 
147
  gr.Markdown("""### Choose a tabular data file (xlsx or csv) of open text to extract topics from.""")
148
  with gr.Row():
149
  model_choice = gr.Dropdown(value = default_model_choice, choices = model_full_names, label="LLM model", multiselect=False)
150
+
151
 
152
  with gr.Accordion("Upload xlsx or csv file", open = True):
153
  in_data_files = gr.File(height=FILE_INPUT_HEIGHT, label="Choose Excel or csv files", file_count= "multiple", file_types=['.xlsx', '.xls', '.csv', '.parquet', '.csv.gz'])
 
308
  aws_access_key_textbox = gr.Textbox(label="AWS access key", interactive=False, lines=1, type="password")
309
  aws_secret_key_textbox = gr.Textbox(label="AWS secret key", interactive=False, lines=1, type="password")
310
 
311
+ with gr.Accordion("Enter Gemini API keys", open = False):
312
+ google_api_key_textbox = gr.Textbox(value = GEMINI_API_KEY, label="Enter Gemini API key (only if using Google API models)", lines=1, type="password")
313
+
314
  # Invisible text box to hold the session hash/username just for logging purposes
315
  session_hash_textbox = gr.Textbox(label = "Session hash", value="", visible=False)
316
 
 
318
  total_number_of_batches = gr.Number(label = "Current batch number", value = 1, precision=0, visible=False)
319
 
320
  text_output_logs = gr.Textbox(label = "Output summary logs", visible=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
321
 
322
  ###
323
  # INTERACTIVE ELEMENT FUNCTIONS
 
354
  display_topic_table_markdown,
355
  original_data_file_name_textbox,
356
  total_number_of_batches,
357
+ google_api_key_textbox,
358
  temperature_slide,
359
  in_colnames,
360
  model_choice,
 
401
  output_tokens_num,
402
  number_of_calls_num],
403
  api_name="extract_topics")
404
+
405
  ###
406
  # DEDUPLICATION AND SUMMARISATION FUNCTIONS
407
  ###
 
420
  success(fn= enforce_cost_codes, inputs=[enforce_cost_code_textbox, cost_code_choice_drop, cost_code_dataframe_base]).\
421
  success(load_in_previous_data_files, inputs=[summarisation_input_files], outputs=[master_reference_df_state, master_unique_topics_df_state, latest_batch_completed_no_loop, deduplication_input_files_status, working_data_file_name_textbox, unique_topics_table_file_name_textbox]).\
422
  success(sample_reference_table_summaries, inputs=[master_reference_df_state, random_seed], outputs=[summary_reference_table_sample_state, summarised_references_markdown], api_name="sample_summaries").\
423
+ success(summarise_output_topics, inputs=[summary_reference_table_sample_state, master_unique_topics_df_state, master_reference_df_state, model_choice, google_api_key_textbox, temperature_slide, working_data_file_name_textbox, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, in_data_files, in_excel_sheets, in_colnames, log_files_output_list_state, summarise_format_radio, output_folder_state, context_textbox, aws_access_key_textbox, aws_secret_key_textbox], outputs=[summary_reference_table_sample_state, master_unique_topics_df_revised_summaries_state, master_reference_df_revised_summaries_state, summary_output_files, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, summarised_output_markdown, log_files_output, overall_summarisation_input_files, input_tokens_num, output_tokens_num, number_of_calls_num, estimated_time_taken_number], api_name="summarise_topics")
424
 
425
+ # latest_summary_completed_num.change(summarise_output_topics, inputs=[summary_reference_table_sample_state, master_unique_topics_df_state, master_reference_df_state, model_choice, google_api_key_textbox, temperature_slide, working_data_file_name_textbox, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, in_data_files, in_excel_sheets, in_colnames, log_files_output_list_state, summarise_format_radio, output_folder_state, context_textbox], outputs=[summary_reference_table_sample_state, master_unique_topics_df_revised_summaries_state, master_reference_df_revised_summaries_state, summary_output_files, summarised_outputs_list, latest_summary_completed_num, conversation_metadata_textbox, summarised_output_markdown, log_files_output, overall_summarisation_input_files, input_tokens_num, output_tokens_num, number_of_calls_num], scroll_to_output=True)
426
 
427
  # SUMMARISE WHOLE TABLE PAGE
428
  overall_summarise_previous_data_btn.click(fn= enforce_cost_codes, inputs=[enforce_cost_code_textbox, cost_code_choice_drop, cost_code_dataframe_base]).\
429
  success(load_in_previous_data_files, inputs=[overall_summarisation_input_files], outputs=[master_reference_df_state, master_unique_topics_df_state, latest_batch_completed_no_loop, deduplication_input_files_status, working_data_file_name_textbox, unique_topics_table_file_name_textbox]).\
430
+ success(overall_summary, inputs=[master_unique_topics_df_state, model_choice, google_api_key_textbox, temperature_slide, unique_topics_table_file_name_textbox, output_folder_state, in_colnames, context_textbox, aws_access_key_textbox, aws_secret_key_textbox], outputs=[overall_summary_output_files, overall_summarised_output_markdown, summarised_output_df, conversation_metadata_textbox, input_tokens_num, output_tokens_num, number_of_calls_num, estimated_time_taken_number], scroll_to_output=True, api_name="overall_summary")
431
 
432
  ###
433
  # CONTINUE PREVIOUS TOPIC EXTRACTION PAGE
 
502
  usage_callback.setup([session_hash_textbox, original_data_file_name_textbox, in_colnames, model_choice, conversation_metadata_textbox, input_tokens_num,
503
  output_tokens_num, number_of_calls_num, estimated_time_taken_number, cost_code_choice_drop], USAGE_LOGS_FOLDER)
504
 
505
+ def conversation_metadata_textbox_change(textbox_value):
506
+ print("conversation_metadata_textbox_change:", textbox_value)
507
+ return textbox_value
508
+
509
+ number_of_calls_num.change(conversation_metadata_textbox_change, inputs=[conversation_metadata_textbox], outputs=[conversation_metadata_textbox])
510
+
511
+ number_of_calls_num.change(lambda *args: usage_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=USAGE_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_USAGE_LOG_HEADERS, replacement_headers=CSV_USAGE_LOG_HEADERS), [session_hash_textbox, original_data_file_name_textbox, in_colnames, model_choice, conversation_metadata_textbox, input_tokens_num, output_tokens_num, number_of_calls_num, estimated_time_taken_number, cost_code_choice_drop], None, preprocess=False, api_name="usage_logs").\
512
  success(fn = upload_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state, s3_log_bucket_name, aws_access_key_textbox, aws_secret_key_textbox], outputs=[s3_logs_output_textbox])
513
 
514
  # User submitted feedback
requirements_gpu.txt CHANGED
@@ -1,5 +1,5 @@
1
  pandas==2.3.1
2
- gradio==5.42.0
3
  transformers==4.55.2
4
  spaces==0.40.0
5
  boto3==1.40.11
@@ -17,7 +17,7 @@ python-dotenv==1.1.0
17
  # Torch and Llama CPP Python
18
  torch==2.6.0 --extra-index-url https://download.pytorch.org/whl/cu124 # Latest compatible with CUDA 12.4
19
  # For Linux:
20
- #https://github.com/abetlen/llama-cpp-python/releases/download/v0.3.16-cu124/llama_cpp_python-0.3.16-cp311-cp311-linux_x86_64.whl
21
  # For Windows:
22
  #llama-cpp-python==0.3.16 -C cmake.args="-DGGML_CUDA=on" --verbose
23
  # If above doesn't work for Windows, try looking at 'windows_install_llama-cpp-python.txt'
 
1
  pandas==2.3.1
2
+ gradio==5.44.0
3
  transformers==4.55.2
4
  spaces==0.40.0
5
  boto3==1.40.11
 
17
  # Torch and Llama CPP Python
18
  torch==2.6.0 --extra-index-url https://download.pytorch.org/whl/cu124 # Latest compatible with CUDA 12.4
19
  # For Linux:
20
+ https://github.com/abetlen/llama-cpp-python/releases/download/v0.3.16-cu124/llama_cpp_python-0.3.16-cp311-cp311-linux_x86_64.whl
21
  # For Windows:
22
  #llama-cpp-python==0.3.16 -C cmake.args="-DGGML_CUDA=on" --verbose
23
  # If above doesn't work for Windows, try looking at 'windows_install_llama-cpp-python.txt'
tools/config.py CHANGED
@@ -222,7 +222,7 @@ model_full_names = []
222
  model_short_names = []
223
  model_source = []
224
 
225
- CHOSEN_LOCAL_MODEL_TYPE = get_or_create_env_var("CHOSEN_LOCAL_MODEL_TYPE", "Gemma 3 4B") # Gemma 3 1B # "Gemma 2b"
226
 
227
  if RUN_LOCAL_MODEL == "1" and CHOSEN_LOCAL_MODEL_TYPE:
228
  model_full_names.append(CHOSEN_LOCAL_MODEL_TYPE)
@@ -252,18 +252,28 @@ model_name_map = {
252
  # HF token may or may not be needed for downloading models from Hugging Face
253
  HF_TOKEN = get_or_create_env_var('HF_TOKEN', '')
254
 
255
- GEMMA2_REPO_ID = get_or_create_env_var("GEMMA2_2B_REPO_ID", "lmstudio-community/gemma-2-2b-it-GGUF")# "bartowski/Llama-3.2-3B-Instruct-GGUF") # "lmstudio-community/gemma-2-2b-it-GGUF")#"QuantFactory/Phi-3-mini-128k-instruct-GGUF")
256
- GEMMA2_MODEL_FILE = get_or_create_env_var("GEMMA2_2B_MODEL_FILE", "gemma-2-2b-it-Q8_0.gguf") # )"Llama-3.2-3B-Instruct-Q5_K_M.gguf") #"gemma-2-2b-it-Q8_0.gguf") #"Phi-3-mini-128k-instruct.Q4_K_M.gguf")
257
- GEMMA2_MODEL_FOLDER = get_or_create_env_var("GEMMA2_2B_MODEL_FOLDER", "model/gemma") #"model/phi" # Assuming this is your intended directory
258
 
259
- GEMMA3_REPO_ID = get_or_create_env_var("GEMMA3_REPO_ID", "ggml-org/gemma-3-1b-it-GGUF")# "bartowski/Llama-3.2-3B-Instruct-GGUF") # "lmstudio-community/gemma-2-2b-it-GGUF")#"QuantFactory/Phi-3-mini-128k-instruct-GGUF")
260
- GEMMA3_MODEL_FILE = get_or_create_env_var("GEMMA3_MODEL_FILE", "gemma-3-1b-it-Q8_0.gguf") # )"Llama-3.2-3B-Instruct-Q5_K_M.gguf") #"gemma-2-2b-it-Q8_0.gguf") #"Phi-3-mini-128k-instruct.Q4_K_M.gguf")
261
  GEMMA3_MODEL_FOLDER = get_or_create_env_var("GEMMA3_MODEL_FOLDER", "model/gemma")
262
 
263
- GEMMA3_4B_REPO_ID = get_or_create_env_var("GEMMA3_4B_REPO_ID", "ggml-org/gemma-3-4b-it-GGUF")# "bartowski/Llama-3.2-3B-Instruct-GGUF") # "lmstudio-community/gemma-2-2b-it-GGUF")#"QuantFactory/Phi-3-mini-128k-instruct-GGUF")
264
- GEMMA3_4B_MODEL_FILE = get_or_create_env_var("GEMMA3_4B_MODEL_FILE", "gemma-3-4b-it-Q4_K_M.gguf") # )"Llama-3.2-3B-Instruct-Q5_K_M.gguf") #"gemma-2-2b-it-Q8_0.gguf") #"Phi-3-mini-128k-instruct.Q4_K_M.gguf")
265
  GEMMA3_4B_MODEL_FOLDER = get_or_create_env_var("GEMMA3_4B_MODEL_FOLDER", "model/gemma3_4b")
266
 
 
 
 
 
 
 
 
 
 
 
267
  if CHOSEN_LOCAL_MODEL_TYPE == "Gemma 2b":
268
  LOCAL_REPO_ID = GEMMA2_REPO_ID
269
  LOCAL_MODEL_FILE = GEMMA2_MODEL_FILE
@@ -280,25 +290,34 @@ elif CHOSEN_LOCAL_MODEL_TYPE == "Gemma 3 4B":
280
  LOCAL_MODEL_FILE = GEMMA3_4B_MODEL_FILE
281
  LOCAL_MODEL_FOLDER = GEMMA3_4B_MODEL_FOLDER
282
 
 
 
 
 
 
283
  print("CHOSEN_LOCAL_MODEL_TYPE:", CHOSEN_LOCAL_MODEL_TYPE)
284
  print("LOCAL_REPO_ID:", LOCAL_REPO_ID)
285
  print("LOCAL_MODEL_FILE:", LOCAL_MODEL_FILE)
286
  print("LOCAL_MODEL_FOLDER:", LOCAL_MODEL_FOLDER)
287
 
288
- LLM_MAX_GPU_LAYERS = int(get_or_create_env_var('LLM_MAX_GPU_LAYERS','-1'))
289
  LLM_TEMPERATURE = float(get_or_create_env_var('LLM_TEMPERATURE', '0.1'))
290
- LLM_TOP_K = int(get_or_create_env_var('LLM_TOP_K','3'))
291
- LLM_TOP_P = float(get_or_create_env_var('LLM_TOP_P', '1'))
292
- LLM_REPETITION_PENALTY = float(get_or_create_env_var('LLM_REPETITION_PENALTY', '1.2')) # Mild repetition penalty to prevent repeating table rows
 
293
  LLM_LAST_N_TOKENS = int(get_or_create_env_var('LLM_LAST_N_TOKENS', '512'))
294
  LLM_MAX_NEW_TOKENS = int(get_or_create_env_var('LLM_MAX_NEW_TOKENS', '4096'))
295
  LLM_SEED = int(get_or_create_env_var('LLM_SEED', '42'))
296
  LLM_RESET = get_or_create_env_var('LLM_RESET', 'True')
297
  LLM_STREAM = get_or_create_env_var('LLM_STREAM', 'False')
298
  LLM_THREADS = int(get_or_create_env_var('LLM_THREADS', '4'))
299
- LLM_BATCH_SIZE = int(get_or_create_env_var('LLM_BATCH_SIZE', '256'))
300
  LLM_CONTEXT_LENGTH = int(get_or_create_env_var('LLM_CONTEXT_LENGTH', '16384'))
301
  LLM_SAMPLE = get_or_create_env_var('LLM_SAMPLE', 'True')
 
 
 
302
 
303
  MAX_GROUPS = int(get_or_create_env_var('MAX_GROUPS', '99'))
304
 
 
222
  model_short_names = []
223
  model_source = []
224
 
225
+ CHOSEN_LOCAL_MODEL_TYPE = get_or_create_env_var("CHOSEN_LOCAL_MODEL_TYPE", "gpt-oss-20b") # Gemma 3 1B # "Gemma 2b" # "Gemma 3 4B"
226
 
227
  if RUN_LOCAL_MODEL == "1" and CHOSEN_LOCAL_MODEL_TYPE:
228
  model_full_names.append(CHOSEN_LOCAL_MODEL_TYPE)
 
252
  # HF token may or may not be needed for downloading models from Hugging Face
253
  HF_TOKEN = get_or_create_env_var('HF_TOKEN', '')
254
 
255
+ GEMMA2_REPO_ID = get_or_create_env_var("GEMMA2_2B_REPO_ID", "lmstudio-community/gemma-2-2b-it-GGUF")
256
+ GEMMA2_MODEL_FILE = get_or_create_env_var("GEMMA2_2B_MODEL_FILE", "gemma-2-2b-it-Q8_0.gguf")
257
+ GEMMA2_MODEL_FOLDER = get_or_create_env_var("GEMMA2_2B_MODEL_FOLDER", "model/gemma")
258
 
259
+ GEMMA3_REPO_ID = get_or_create_env_var("GEMMA3_REPO_ID", "unsloth/gemma-3-270m-it-qat-GGUF")
260
+ GEMMA3_MODEL_FILE = get_or_create_env_var("GEMMA3_MODEL_FILE", "gemma-3-270m-it-qat-F16.gguf")
261
  GEMMA3_MODEL_FOLDER = get_or_create_env_var("GEMMA3_MODEL_FOLDER", "model/gemma")
262
 
263
+ GEMMA3_4B_REPO_ID = get_or_create_env_var("GEMMA3_4B_REPO_ID", "unsloth/gemma-3-4b-it-qat-GGUF")
264
+ GEMMA3_4B_MODEL_FILE = get_or_create_env_var("GEMMA3_4B_MODEL_FILE", "gemma-3-4b-it-qat-Q4_K_M.gguf")
265
  GEMMA3_4B_MODEL_FOLDER = get_or_create_env_var("GEMMA3_4B_MODEL_FOLDER", "model/gemma3_4b")
266
 
267
+ GPT_OSS_REPO_ID = get_or_create_env_var("GPT_OSS_REPO_ID", "unsloth/gpt-oss-20b-GGUF")
268
+ GPT_OSS_MODEL_FILE = get_or_create_env_var("GPT_OSS_MODEL_FILE", "gpt-oss-20b-F16.gguf")
269
+ GPT_OSS_MODEL_FOLDER = get_or_create_env_var("GPT_OSS_MODEL_FOLDER", "model/gpt_oss")
270
+
271
+ USE_SPECULATIVE_DECODING = get_or_create_env_var("USE_SPECULATIVE_DECODING", "False")
272
+
273
+ GEMMA3_DRAFT_MODEL_LOC = get_or_create_env_var("GEMMA3_DRAFT_MODEL_LOC", ".cache/llama.cpp/unsloth_gemma-3-270m-it-qat-GGUF_gemma-3-270m-it-qat-F16.gguf")
274
+
275
+ GEMMA3_4B_DRAFT_MODEL_LOC = get_or_create_env_var("GEMMA3_4B_DRAFT_MODEL_LOC", ".cache/llama.cpp/unsloth_gemma-3-4b-it-qat-GGUF_gemma-3-4b-it-qat-Q4_K_M.gguf")
276
+
277
  if CHOSEN_LOCAL_MODEL_TYPE == "Gemma 2b":
278
  LOCAL_REPO_ID = GEMMA2_REPO_ID
279
  LOCAL_MODEL_FILE = GEMMA2_MODEL_FILE
 
290
  LOCAL_MODEL_FILE = GEMMA3_4B_MODEL_FILE
291
  LOCAL_MODEL_FOLDER = GEMMA3_4B_MODEL_FOLDER
292
 
293
+ elif CHOSEN_LOCAL_MODEL_TYPE == "gpt-oss-20b":
294
+ LOCAL_REPO_ID = GPT_OSS_REPO_ID
295
+ LOCAL_MODEL_FILE = GPT_OSS_MODEL_FILE
296
+ LOCAL_MODEL_FOLDER = GPT_OSS_MODEL_FOLDER
297
+
298
  print("CHOSEN_LOCAL_MODEL_TYPE:", CHOSEN_LOCAL_MODEL_TYPE)
299
  print("LOCAL_REPO_ID:", LOCAL_REPO_ID)
300
  print("LOCAL_MODEL_FILE:", LOCAL_MODEL_FILE)
301
  print("LOCAL_MODEL_FOLDER:", LOCAL_MODEL_FOLDER)
302
 
303
+ LLM_MAX_GPU_LAYERS = int(get_or_create_env_var('LLM_MAX_GPU_LAYERS','-1')) # Maximum possible
304
  LLM_TEMPERATURE = float(get_or_create_env_var('LLM_TEMPERATURE', '0.1'))
305
+ LLM_TOP_K = int(get_or_create_env_var('LLM_TOP_K','96'))
306
+ LLM_MIN_P = float(get_or_create_env_var('LLM_MIN_P', '0'))
307
+ LLM_TOP_P = float(get_or_create_env_var('LLM_TOP_P', '0.95'))
308
+ LLM_REPETITION_PENALTY = float(get_or_create_env_var('LLM_REPETITION_PENALTY', '1.0'))
309
  LLM_LAST_N_TOKENS = int(get_or_create_env_var('LLM_LAST_N_TOKENS', '512'))
310
  LLM_MAX_NEW_TOKENS = int(get_or_create_env_var('LLM_MAX_NEW_TOKENS', '4096'))
311
  LLM_SEED = int(get_or_create_env_var('LLM_SEED', '42'))
312
  LLM_RESET = get_or_create_env_var('LLM_RESET', 'True')
313
  LLM_STREAM = get_or_create_env_var('LLM_STREAM', 'False')
314
  LLM_THREADS = int(get_or_create_env_var('LLM_THREADS', '4'))
315
+ LLM_BATCH_SIZE = int(get_or_create_env_var('LLM_BATCH_SIZE', '128'))
316
  LLM_CONTEXT_LENGTH = int(get_or_create_env_var('LLM_CONTEXT_LENGTH', '16384'))
317
  LLM_SAMPLE = get_or_create_env_var('LLM_SAMPLE', 'True')
318
+ SPECULATIVE_DECODING = get_or_create_env_var('SPECULATIVE_DECODING', 'False')
319
+ NUM_PRED_TOKENS = int(get_or_create_env_var('NUM_PRED_TOKENS', '2'))
320
+ REASONING_SUFFIX = get_or_create_env_var('REASONING_SUFFIX', 'Reasoning: low')
321
 
322
  MAX_GROUPS = int(get_or_create_env_var('MAX_GROUPS', '99'))
323
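The new USE_SPECULATIVE_DECODING, NUM_PRED_TOKENS and draft-model settings above are not wired up anywhere in this diff. One plausible sketch is shown below, assuming the prompt-lookup draft model bundled with llama-cpp-python is used; how the GEMMA3_*_DRAFT_MODEL_LOC paths are consumed is an assumption, so treat this purely as an illustration.

import os
from llama_cpp import Llama
from llama_cpp.llama_speculative import LlamaPromptLookupDecoding

# Assumed wiring (not shown in this commit): enable the bindings' built-in
# prompt-lookup speculative decoding when USE_SPECULATIVE_DECODING is "True".
draft_model = None
if os.environ.get("USE_SPECULATIVE_DECODING", "False") == "True":
    draft_model = LlamaPromptLookupDecoding(
        num_pred_tokens=int(os.environ.get("NUM_PRED_TOKENS", "2"))
    )

llm = Llama(
    model_path="model/gpt_oss/gpt-oss-20b-F16.gguf",  # illustrative; see GPT_OSS_* defaults above
    n_gpu_layers=-1,
    n_ctx=16384,
    draft_model=draft_model,
)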
 
tools/custom_csvlogger.py CHANGED
@@ -106,7 +106,7 @@ class CSVLogger_custom(FlaggingCallback):
106
  )
107
  latest_num = int(re.findall(r"\d+", latest_file.stem)[0])
108
 
109
- with open(latest_file, newline="", encoding="utf-8") as csvfile:
110
  reader = csv.reader(csvfile)
111
  existing_headers = next(reader, None)
112
 
@@ -122,7 +122,7 @@ class CSVLogger_custom(FlaggingCallback):
122
 
123
  if not Path(self.dataset_filepath).exists():
124
  with open(
125
- self.dataset_filepath, "w", newline="", encoding="utf-8"
126
  ) as csvfile:
127
  writer = csv.writer(csvfile)
128
  writer.writerow(utils.sanitize_list_for_csv(headers))
@@ -202,15 +202,12 @@ class CSVLogger_custom(FlaggingCallback):
202
 
203
  if save_to_csv:
204
  with self.lock:
205
- with open(self.dataset_filepath, "a", newline="", encoding="utf-8") as csvfile:
206
  writer = csv.writer(csvfile)
207
  writer.writerow(utils.sanitize_list_for_csv(csv_data))
208
- with open(self.dataset_filepath, encoding="utf-8") as csvfile:
209
  line_count = len(list(csv.reader(csvfile))) - 1
210
 
211
-
212
- print("save_to_dynamodb:", save_to_dynamodb)
213
- print("save_to_dynamodb == True:", save_to_dynamodb == True)
214
  if save_to_dynamodb == True:
215
  print("Saving to DynamoDB")
216
 
 
106
  )
107
  latest_num = int(re.findall(r"\d+", latest_file.stem)[0])
108
 
109
+ with open(latest_file, newline="", encoding="utf-8-sig") as csvfile:
110
  reader = csv.reader(csvfile)
111
  existing_headers = next(reader, None)
112
 
 
122
 
123
  if not Path(self.dataset_filepath).exists():
124
  with open(
125
+ self.dataset_filepath, "w", newline="", encoding="utf-8-sig"
126
  ) as csvfile:
127
  writer = csv.writer(csvfile)
128
  writer.writerow(utils.sanitize_list_for_csv(headers))
 
202
 
203
  if save_to_csv:
204
  with self.lock:
205
+ with open(self.dataset_filepath, "a", newline="", encoding="utf-8-sig") as csvfile:
206
  writer = csv.writer(csvfile)
207
  writer.writerow(utils.sanitize_list_for_csv(csv_data))
208
+ with open(self.dataset_filepath, encoding="utf-8-sig") as csvfile:
209
  line_count = len(list(csv.reader(csvfile))) - 1
210
 
 
 
 
211
  if save_to_dynamodb == True:
212
  print("Saving to DynamoDB")
213
 
tools/dedup_summaries.py CHANGED
@@ -436,7 +436,7 @@ def summarise_output_topics_query(model_choice:str, in_api_key:str, temperature:
436
  if isinstance(responses[-1], ResponseObject):
437
  response_texts = [resp.text for resp in responses]
438
  elif "choices" in responses[-1]:
439
- response_texts = [resp["choices"][0]['text'] for resp in responses]
440
  else:
441
  response_texts = [resp.text for resp in responses]
442
 
 
436
  if isinstance(responses[-1], ResponseObject):
437
  response_texts = [resp.text for resp in responses]
438
  elif "choices" in responses[-1]:
439
+ response_texts = [resp["choices"][0]['message']['content'] for resp in responses] #resp["choices"][0]['text'] for resp in responses]
440
  else:
441
  response_texts = [resp.text for resp in responses]
442
 
tools/llm_api_call.py CHANGED
@@ -17,7 +17,7 @@ GradioFileData = gr.FileData
17
  from tools.prompts import initial_table_prompt, prompt2, prompt3, initial_table_system_prompt, add_existing_topics_system_prompt, add_existing_topics_prompt, force_existing_topics_prompt, allow_new_topics_prompt, force_single_topic_prompt, add_existing_topics_assistant_prefill, initial_table_assistant_prefill, structured_summary_prompt
18
  from tools.helper_functions import read_file, put_columns_in_df, wrap_text, initial_clean, load_in_data_file, load_in_file, create_topic_summary_df_from_reference_table, convert_reference_table_to_pivot_table, get_basic_response_data, clean_column_name, load_in_previous_data_files
19
  from tools.llm_funcs import ResponseObject, construct_gemini_generative_model, call_llm_with_markdown_table_checks, create_missing_references_df, calculate_tokens_from_metadata
20
- from tools.config import RUN_LOCAL_MODEL, AWS_REGION, MAX_COMMENT_CHARS, MAX_OUTPUT_VALIDATION_ATTEMPTS, MAX_TOKENS, TIMEOUT_WAIT, NUMBER_OF_RETRY_ATTEMPTS, MAX_TIME_FOR_LOOP, BATCH_SIZE_DEFAULT, DEDUPLICATION_THRESHOLD, RUN_AWS_FUNCTIONS, model_name_map, OUTPUT_FOLDER, CHOSEN_LOCAL_MODEL_TYPE, LOCAL_REPO_ID, LOCAL_MODEL_FILE, LOCAL_MODEL_FOLDER, LLM_SEED, MAX_GROUPS
21
  from tools.aws_functions import connect_to_bedrock_runtime
22
 
23
  if RUN_LOCAL_MODEL == "1":
@@ -31,11 +31,12 @@ batch_size_default = BATCH_SIZE_DEFAULT
31
  deduplication_threshold = DEDUPLICATION_THRESHOLD
32
  max_comment_character_length = MAX_COMMENT_CHARS
33
  random_seed = LLM_SEED
 
34
 
35
  # if RUN_AWS_FUNCTIONS == '1':
36
  # bedrock_runtime = boto3.client('bedrock-runtime', region_name=AWS_REGION)
37
  # else:
38
- # bedrock_runtime = []
39
 
40
 
41
 
@@ -130,7 +131,7 @@ def clean_markdown_table(text: str):
130
  lines = text.splitlines()
131
 
132
  # Step 1: Identify table structure and process line continuations
133
- table_rows = []
134
  current_row = None
135
 
136
  for line in lines:
@@ -174,7 +175,7 @@ def clean_markdown_table(text: str):
174
  max_columns = max(max_columns, len(cells))
175
 
176
  # Now format each row
177
- formatted_rows = []
178
  for row in table_rows:
179
  # Ensure the row starts and ends with pipes
180
  if not row.startswith('|'):
@@ -354,7 +355,7 @@ def write_llm_output_and_logs(responses: List[ResponseObject],
354
  - first_run (bool): A boolean indicating if this is the first run through this function in this process. Defaults to False.
355
  - output_folder (str): The name of the folder where output files are saved.
356
  """
357
- topic_summary_df_out_path = []
358
  topic_table_out_path = "topic_table_error.csv"
359
  reference_table_out_path = "reference_table_error.csv"
360
  topic_summary_df_out_path = "unique_topic_table_error.csv"
@@ -390,7 +391,7 @@ def write_llm_output_and_logs(responses: List[ResponseObject],
390
  log_files_output_paths.append(whole_conversation_path_meta)
391
 
392
  if isinstance(responses[-1], ResponseObject): response_text = responses[-1].text
393
- elif "choices" in responses[-1]: response_text = responses[-1]["choices"][0]['text']
394
  else: response_text = responses[-1].text
395
 
396
  # Convert response text to a markdown table
@@ -426,7 +427,7 @@ def write_llm_output_and_logs(responses: List[ResponseObject],
426
  topic_table_out_path = output_folder + batch_file_path_details + "_topic_table_" + model_choice_clean + ".csv"
427
 
428
  # Table to map references to topics
429
- reference_data = []
430
 
431
  batch_basic_response_df["Reference"] = batch_basic_response_df["Reference"].astype(str)
432
 
@@ -614,13 +615,7 @@ def generate_zero_shot_topics_df(zero_shot_topics:pd.DataFrame,
614
  if force_zero_shot_radio == "Yes":
615
  zero_shot_topics_gen_topics_list.append("")
616
  zero_shot_topics_subtopics_list.append("No relevant topic")
617
- zero_shot_topics_description_list.append("")
618
-
619
- # This process was abandoned (revising the general topics) as it didn't seem to work
620
- # if create_revised_general_topics == True:
621
- # pass
622
- # else:
623
- # pass
624
 
625
  # Add description or not
626
  zero_shot_topics_df = pd.DataFrame(data={
@@ -646,9 +641,9 @@ def extract_topics(in_data_file: GradioFileData,
646
  model_choice:str,
647
  candidate_topics: GradioFileData = None,
648
  latest_batch_completed:int=0,
649
- out_message:List=[],
650
- out_file_paths:List = [],
651
- log_files_output_paths:List = [],
652
  first_loop_state:bool=False,
653
  whole_conversation_metadata_str:str="",
654
  initial_table_prompt:str=initial_table_prompt,
@@ -663,7 +658,7 @@ def extract_topics(in_data_file: GradioFileData,
663
  time_taken:float = 0,
664
  sentiment_checkbox:str = "Negative, Neutral, or Positive",
665
  force_zero_shot_radio:str = "No",
666
- in_excel_sheets:List[str] = [],
667
  force_single_topic_radio:str = "No",
668
  output_folder:str=OUTPUT_FOLDER,
669
  force_single_topic_prompt:str=force_single_topic_prompt,
@@ -675,6 +670,7 @@ def extract_topics(in_data_file: GradioFileData,
675
  model_name_map:dict=model_name_map,
676
  max_time_for_loop:int=max_time_for_loop,
677
  CHOSEN_LOCAL_MODEL_TYPE:str=CHOSEN_LOCAL_MODEL_TYPE,
 
678
  progress=Progress(track_tqdm=True)):
679
 
680
  '''
@@ -723,31 +719,36 @@ def extract_topics(in_data_file: GradioFileData,
723
  - model_name_map (dict, optional): A dictionary mapping full model name to shortened.
724
  - max_time_for_loop (int, optional): The number of seconds maximum that the function should run for before breaking (to run again, this is to avoid timeouts with some AWS services if deployed there).
725
  - CHOSEN_LOCAL_MODEL_TYPE (str, optional): The name of the chosen local model.
 
726
  - progress (Progress): A progress tracker.
727
  '''
728
 
729
  tic = time.perf_counter()
730
- google_client = []
731
  google_config = {}
732
  final_time = 0.0
733
- whole_conversation_metadata = []
734
  is_error = False
735
  create_revised_general_topics = False
736
- local_model = []
737
- tokenizer = []
738
  zero_shot_topics_df = pd.DataFrame()
739
  missing_df = pd.DataFrame()
740
  new_reference_df = pd.DataFrame(columns=["Response References", "General topic", "Subtopic", "Sentiment", "Start row of group", "Group" ,"Topic_number", "Summary"])
741
  new_topic_summary_df = pd.DataFrame(columns=["General topic","Subtopic","Sentiment","Group","Number of responses","Summary"])
742
  new_topic_df = pd.DataFrame()
743
- #llama_system_prefix = "<|start_header_id|>system<|end_header_id|>\n" #"<start_of_turn>user\n"
744
- #llama_system_suffix = "<|eot_id|>" #"<end_of_turn>\n<start_of_turn>model\n"
745
- #llama_cpp_prefix = "<|start_header_id|>system<|end_header_id|>\nYou are an AI assistant that follows instruction extremely well. Help as much as you can.<|eot_id|><|start_header_id|>user<|end_header_id|>\n" #"<start_of_turn>user\n"
746
- #llama_cpp_suffix = "<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n" #"<end_of_turn>\n<start_of_turn>model\n"
747
- #llama_cpp_prefix = "<|user|>\n" # This is for phi 3.5
748
- #llama_cpp_suffix = "<|end|>\n<|assistant|>|" # This is for phi 3.5
749
- llama_cpp_prefix = "<start_of_turn>user\n"
750
- llama_cpp_suffix = "<end_of_turn>\n<start_of_turn>model\n"
 
 
 
 
751
 
752
  #print("output_folder:", output_folder)
753
 
@@ -773,8 +774,8 @@ def extract_topics(in_data_file: GradioFileData,
773
  print("This is the first time through the loop, resetting latest_batch_completed to 0")
774
  if (latest_batch_completed == 999) | (latest_batch_completed == 0):
775
  latest_batch_completed = 0
776
- out_message = []
777
- out_file_paths = []
778
  final_time = 0
779
 
780
  if (model_choice == CHOSEN_LOCAL_MODEL_TYPE) & (RUN_LOCAL_MODEL == "1"):
@@ -795,7 +796,7 @@ def extract_topics(in_data_file: GradioFileData,
795
  out_message = [out_message]
796
 
797
  if not out_file_paths:
798
- out_file_paths = []
799
 
800
 
801
  if "anthropic.claude-3-sonnet" in model_choice and file_data.shape[1] > 300:
@@ -819,7 +820,7 @@ def extract_topics(in_data_file: GradioFileData,
819
  simplified_csv_table_path, normalised_simple_markdown_table, start_row, end_row, batch_basic_response_df = data_file_to_markdown_table(file_data, file_name, chosen_cols, latest_batch_completed, batch_size)
820
 
821
  # Conversation history
822
- conversation_history = []
823
 
824
  # If the latest batch of responses contains at least one instance of text
825
  if not batch_basic_response_df.empty:
@@ -933,13 +934,15 @@ def extract_topics(in_data_file: GradioFileData,
933
  except Exception as e:
934
  print(f"Error writing prompt to file {formatted_prompt_output_path}: {e}")
935
 
936
- if "gemma" in model_choice:
937
- summary_prompt_list = [full_prompt] # Includes system prompt
938
- else:
939
- summary_prompt_list = [formatted_summary_prompt]
 
 
940
 
941
- conversation_history = []
942
- whole_conversation = []
943
 
944
  # Process requests to large language model
945
  responses, conversation_history, whole_conversation, whole_conversation_metadata, response_text = call_llm_with_markdown_table_checks(summary_prompt_list, formatted_system_prompt, conversation_history, whole_conversation, whole_conversation_metadata, google_client, google_config, model_choice, temperature, reported_batch_no, local_model, bedrock_runtime, model_source, MAX_OUTPUT_VALIDATION_ATTEMPTS, assistant_prefill=add_existing_topics_assistant_prefill, master = True)
@@ -953,13 +956,16 @@ def extract_topics(in_data_file: GradioFileData,
953
 
954
  if isinstance(responses[-1], ResponseObject):
955
  with open(final_table_output_path, "w", encoding='utf-8-sig', errors='replace') as f:
956
- f.write(responses[-1].text)
 
957
  elif "choices" in responses[-1]:
958
  with open(final_table_output_path, "w", encoding='utf-8-sig', errors='replace') as f:
959
- f.write(responses[-1]["choices"][0]['text'])
 
960
  else:
961
  with open(final_table_output_path, "w", encoding='utf-8-sig', errors='replace') as f:
962
- f.write(responses[-1].text)
 
963
 
964
  except Exception as e:
965
  print("Error in returning model response:", e)
@@ -1020,26 +1026,23 @@ def extract_topics(in_data_file: GradioFileData,
1020
  if prompt3: formatted_prompt3 = prompt3.format(response_table=normalised_simple_markdown_table, sentiment_choices=sentiment_prompt)
1021
  else: formatted_prompt3 = prompt3
1022
 
1023
- if "gemma" in model_choice:
1024
- formatted_initial_table_prompt = llama_cpp_prefix + formatted_initial_table_system_prompt + "\n" + formatted_initial_table_prompt + llama_cpp_suffix
1025
- formatted_prompt2 = llama_cpp_prefix + formatted_initial_table_system_prompt + "\n" + formatted_prompt2 + llama_cpp_suffix
1026
- formatted_prompt3 = llama_cpp_prefix + formatted_initial_table_system_prompt + "\n" + formatted_prompt3 + llama_cpp_suffix
 
 
1027
 
1028
- batch_prompts = [formatted_initial_table_prompt, formatted_prompt2, formatted_prompt3][:number_of_prompts_used] # Adjust this list to send fewer requests
1029
 
1030
- whole_conversation = []
1031
 
1032
  responses, conversation_history, whole_conversation, whole_conversation_metadata, response_text = call_llm_with_markdown_table_checks(batch_prompts, formatted_initial_table_system_prompt, conversation_history, whole_conversation, whole_conversation_metadata, google_client, google_config, model_choice, temperature, reported_batch_no, local_model, bedrock_runtime, model_source, MAX_OUTPUT_VALIDATION_ATTEMPTS, assistant_prefill=initial_table_assistant_prefill)
1033
 
1034
  topic_table_out_path, reference_table_out_path, topic_summary_df_out_path, topic_table_df, reference_df, new_topic_summary_df, batch_file_path_details, is_error = write_llm_output_and_logs(responses, whole_conversation, whole_conversation_metadata, file_name, latest_batch_completed, start_row, end_row, model_choice_clean, temperature, log_files_output_paths, existing_reference_df, existing_topic_summary_df, batch_size, chosen_cols, batch_basic_response_df, group_name, produce_structures_summary_radio, first_run=True, output_folder=output_folder)
1035
 
1036
  # If error in table parsing, leave function
1037
- if is_error == True:
1038
- raise Exception("Error in output table parsing")
1039
- # unique_table_df_display_table_markdown, new_topic_df, new_topic_summary_df, new_reference_df, out_file_paths, out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, out_file_paths#, final_message_out
1040
-
1041
-
1042
- #all_topic_tables_df.append(topic_table_df)
1043
 
1044
  topic_table_df.to_csv(topic_table_out_path, index=None)
1045
  out_file_paths.append(topic_table_out_path)
@@ -1056,28 +1059,28 @@ def extract_topics(in_data_file: GradioFileData,
1056
  new_topic_summary_df.to_csv(topic_summary_df_out_path, index=None)
1057
  out_file_paths.append(topic_summary_df_out_path)
1058
 
1059
- #all_markdown_topic_tables.append(markdown_table)
1060
 
1061
  whole_conversation_metadata.append(whole_conversation_metadata_str)
1062
  whole_conversation_metadata_str = '. '.join(whole_conversation_metadata)
1063
 
1064
- # Write final output to text file also
1065
  # Write final output to text file for logging purposes
1066
  try:
1067
  final_table_output_path = output_folder + batch_file_path_details + "_full_response_" + model_choice_clean + ".txt"
1068
 
1069
  if isinstance(responses[-1], ResponseObject):
1070
  with open(final_table_output_path, "w", encoding='utf-8-sig', errors='replace') as f:
1071
- f.write(responses[-1].text)
 
1072
  elif "choices" in responses[-1]:
1073
  with open(final_table_output_path, "w", encoding='utf-8-sig', errors='replace') as f:
1074
- f.write(responses[-1]["choices"][0]['text'])
 
1075
  else:
1076
  with open(final_table_output_path, "w", encoding='utf-8-sig', errors='replace') as f:
1077
- f.write(responses[-1].text)
 
1078
 
1079
- except Exception as e:
1080
- print("Error in returning model response:", e)
1081
 
1082
  new_topic_df = topic_table_df
1083
  new_reference_df = reference_df
@@ -1122,7 +1125,7 @@ def extract_topics(in_data_file: GradioFileData,
1122
  # Set to a very high number so as not to mess with subsequent file processing by the user
1123
  #latest_batch_completed = 999
1124
 
1125
- join_file_paths = []
1126
 
1127
  toc = time.perf_counter()
1128
  final_time = (toc - tic) + time_taken
@@ -1198,8 +1201,12 @@ def extract_topics(in_data_file: GradioFileData,
1198
 
1199
  print("latest_batch_completed at end of batch iterations to return is", latest_batch_completed)
1200
 
 
 
1201
  return unique_table_df_display_table_markdown, existing_topics_table, final_out_topic_summary_df, existing_reference_df, final_out_file_paths, final_out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, final_out_file_paths, final_out_file_paths, modifiable_topic_summary_df, final_out_file_paths, join_file_paths, existing_reference_df_pivot, missing_df
1202
 
 
 
1203
  return unique_table_df_display_table_markdown, existing_topics_table, existing_topic_summary_df, existing_reference_df, out_file_paths, out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, out_file_paths, out_file_paths, modifiable_topic_summary_df, out_file_paths, join_file_paths, existing_reference_df_pivot, missing_df # gr.Dataframe(value=modifiable_topic_summary_df, headers=None, col_count=(modifiable_topic_summary_df.shape[1], "fixed"), row_count = (modifiable_topic_summary_df.shape[0], "fixed"), visible=True, type="pandas"),
1204
 
1205
  def wrapper_extract_topics_per_column_value(
@@ -1237,7 +1244,7 @@ def wrapper_extract_topics_per_column_value(
1237
  context_textbox: str = "",
1238
  sentiment_checkbox: str = "Negative, Neutral, or Positive",
1239
  force_zero_shot_radio: str = "No",
1240
- in_excel_sheets: List[str] = [],
1241
  force_single_topic_radio: str = "No",
1242
  produce_structures_summary_radio: str = "No",
1243
  aws_access_key_textbox:str="",
@@ -1279,9 +1286,9 @@ def wrapper_extract_topics_per_column_value(
1279
  acc_missing_df = pd.DataFrame()
1280
 
1281
  # Lists are extended
1282
- acc_out_file_paths = []
1283
- acc_log_files_output_paths = []
1284
- acc_join_file_paths = [] # join_file_paths seems to be overwritten, so maybe last one or extend? Let's extend.
1285
 
1286
  # Single value outputs - typically the last one is most relevant, or sum for time
1287
  acc_markdown_output = initial_unique_table_df_display_table_markdown
@@ -1353,9 +1360,9 @@ def wrapper_extract_topics_per_column_value(
1353
  num_batches=current_num_batches,
1354
  latest_batch_completed=current_latest_batch_completed, # Reset for each new segment's internal batching
1355
  first_loop_state=current_first_loop_state, # True only for the very first iteration of wrapper
1356
- out_message=[], # Fresh for each call
1357
- out_file_paths=[],# Fresh for each call
1358
- log_files_output_paths=[],# Fresh for each call
1359
  whole_conversation_metadata_str="", # Fresh for each call
1360
  time_taken=0, # Time taken for this specific call, wrapper sums it.
1361
  # Pass through other parameters
@@ -1450,8 +1457,12 @@ def wrapper_extract_topics_per_column_value(
1450
  unique_table_df_display_table = acc_topic_summary_df.apply(lambda col: col.map(lambda x: wrap_text(x, max_text_length=500)))
1451
  acc_markdown_output = unique_table_df_display_table[["General topic", "Subtopic", "Sentiment", "Number of responses", "Summary", "Group"]].to_markdown(index=False)
1452
 
 
 
1453
  acc_input_tokens, acc_output_tokens, acc_number_of_calls = calculate_tokens_from_metadata(acc_whole_conversation_metadata, model_choice, model_name_map)
1454
 
 
 
1455
  print(f"\nWrapper finished processing all segments. Total time: {acc_total_time_taken:.2f}s")
1456
 
1457
  # The return signature should match extract_topics.
@@ -1537,7 +1548,7 @@ def modify_existing_output_tables(original_topic_summary_df:pd.DataFrame, modifi
1537
  reference_file_path = os.path.basename(reference_files[0]) if reference_files else None
1538
  unique_table_file_path = os.path.basename(unique_files[0]) if unique_files else None
1539
 
1540
- output_file_list = []
1541
 
1542
  if reference_file_path and unique_table_file_path:
1543
 
 
17
  from tools.prompts import initial_table_prompt, prompt2, prompt3, initial_table_system_prompt, add_existing_topics_system_prompt, add_existing_topics_prompt, force_existing_topics_prompt, allow_new_topics_prompt, force_single_topic_prompt, add_existing_topics_assistant_prefill, initial_table_assistant_prefill, structured_summary_prompt
18
  from tools.helper_functions import read_file, put_columns_in_df, wrap_text, initial_clean, load_in_data_file, load_in_file, create_topic_summary_df_from_reference_table, convert_reference_table_to_pivot_table, get_basic_response_data, clean_column_name, load_in_previous_data_files
19
  from tools.llm_funcs import ResponseObject, construct_gemini_generative_model, call_llm_with_markdown_table_checks, create_missing_references_df, calculate_tokens_from_metadata
20
+ from tools.config import RUN_LOCAL_MODEL, AWS_REGION, MAX_COMMENT_CHARS, MAX_OUTPUT_VALIDATION_ATTEMPTS, MAX_TOKENS, TIMEOUT_WAIT, NUMBER_OF_RETRY_ATTEMPTS, MAX_TIME_FOR_LOOP, BATCH_SIZE_DEFAULT, DEDUPLICATION_THRESHOLD, RUN_AWS_FUNCTIONS, model_name_map, OUTPUT_FOLDER, CHOSEN_LOCAL_MODEL_TYPE, LOCAL_REPO_ID, LOCAL_MODEL_FILE, LOCAL_MODEL_FOLDER, LLM_SEED, MAX_GROUPS, REASONING_SUFFIX
21
  from tools.aws_functions import connect_to_bedrock_runtime
22
 
23
  if RUN_LOCAL_MODEL == "1":
 
31
  deduplication_threshold = DEDUPLICATION_THRESHOLD
32
  max_comment_character_length = MAX_COMMENT_CHARS
33
  random_seed = LLM_SEED
34
+ reasoning_suffix = REASONING_SUFFIX
35
 
36
  # if RUN_AWS_FUNCTIONS == '1':
37
  # bedrock_runtime = boto3.client('bedrock-runtime', region_name=AWS_REGION)
38
  # else:
39
+ # bedrock_runtime = list()
40
 
41
 
42
 
 
131
  lines = text.splitlines()
132
 
133
  # Step 1: Identify table structure and process line continuations
134
+ table_rows = list()
135
  current_row = None
136
 
137
  for line in lines:
 
175
  max_columns = max(max_columns, len(cells))
176
 
177
  # Now format each row
178
+ formatted_rows = list()
179
  for row in table_rows:
180
  # Ensure the row starts and ends with pipes
181
  if not row.startswith('|'):
 
355
  - first_run (bool): A boolean indicating if this is the first run through this function in this process. Defaults to False.
356
  - output_folder (str): The name of the folder where output files are saved.
357
  """
358
+ topic_summary_df_out_path = list()
359
  topic_table_out_path = "topic_table_error.csv"
360
  reference_table_out_path = "reference_table_error.csv"
361
  topic_summary_df_out_path = "unique_topic_table_error.csv"
 
391
  log_files_output_paths.append(whole_conversation_path_meta)
392
 
393
  if isinstance(responses[-1], ResponseObject): response_text = responses[-1].text
394
+ elif "choices" in responses[-1]: response_text = responses[-1]['choices'][0]['message']['content'] #responses[-1]["choices"][0]['text']
395
  else: response_text = responses[-1].text
396
 
397
  # Convert response text to a markdown table
 
427
  topic_table_out_path = output_folder + batch_file_path_details + "_topic_table_" + model_choice_clean + ".csv"
428
 
429
  # Table to map references to topics
430
+ reference_data = list()
431
 
432
  batch_basic_response_df["Reference"] = batch_basic_response_df["Reference"].astype(str)
433
 
 
615
  if force_zero_shot_radio == "Yes":
616
  zero_shot_topics_gen_topics_list.append("")
617
  zero_shot_topics_subtopics_list.append("No relevant topic")
618
+ zero_shot_topics_description_list.append("")
 
 
 
 
 
 
619
 
620
  # Add description or not
621
  zero_shot_topics_df = pd.DataFrame(data={
 
641
  model_choice:str,
642
  candidate_topics: GradioFileData = None,
643
  latest_batch_completed:int=0,
644
+ out_message:List= list(),
645
+ out_file_paths:List = list(),
646
+ log_files_output_paths:List = list(),
647
  first_loop_state:bool=False,
648
  whole_conversation_metadata_str:str="",
649
  initial_table_prompt:str=initial_table_prompt,
 
658
  time_taken:float = 0,
659
  sentiment_checkbox:str = "Negative, Neutral, or Positive",
660
  force_zero_shot_radio:str = "No",
661
+ in_excel_sheets:List[str] = list(),
662
  force_single_topic_radio:str = "No",
663
  output_folder:str=OUTPUT_FOLDER,
664
  force_single_topic_prompt:str=force_single_topic_prompt,
 
670
  model_name_map:dict=model_name_map,
671
  max_time_for_loop:int=max_time_for_loop,
672
  CHOSEN_LOCAL_MODEL_TYPE:str=CHOSEN_LOCAL_MODEL_TYPE,
673
+ reasoning_suffix:str=reasoning_suffix,
674
  progress=Progress(track_tqdm=True)):
675
 
676
  '''
 
719
  - model_name_map (dict, optional): A dictionary mapping full model name to shortened.
720
  - max_time_for_loop (int, optional): The number of seconds maximum that the function should run for before breaking (to run again, this is to avoid timeouts with some AWS services if deployed there).
721
  - CHOSEN_LOCAL_MODEL_TYPE (str, optional): The name of the chosen local model.
722
+ - reasoning_suffix (str, optional): The suffix for the reasoning system prompt.
723
  - progress (Progress): A progress tracker.
724
  '''
725
 
726
  tic = time.perf_counter()
727
+ google_client = list()
728
  google_config = {}
729
  final_time = 0.0
730
+ whole_conversation_metadata = list()
731
  is_error = False
732
  create_revised_general_topics = False
733
+ local_model = list()
734
+ tokenizer = list()
735
  zero_shot_topics_df = pd.DataFrame()
736
  missing_df = pd.DataFrame()
737
  new_reference_df = pd.DataFrame(columns=["Response References", "General topic", "Subtopic", "Sentiment", "Start row of group", "Group" ,"Topic_number", "Summary"])
738
  new_topic_summary_df = pd.DataFrame(columns=["General topic","Subtopic","Sentiment","Group","Number of responses","Summary"])
739
  new_topic_df = pd.DataFrame()
740
+
741
+ # For Gemma models
742
+ #llama_cpp_prefix = "<start_of_turn>user\n"
743
+ #llama_cpp_suffix = "<end_of_turn>\n<start_of_turn>model\n"
744
+
745
+ # For GPT OSS
746
+ #llama_cpp_prefix = "<|start|>assistant<|channel|>analysis<|message|>\n"
747
+ #llama_cpp_suffix = "<|start|>assistant<|channel|>final<|message|>"
748
+
749
+ # Blank
750
+ llama_cpp_prefix = ""
751
+ llama_cpp_suffix = ""
752
 
753
  #print("output_folder:", output_folder)
754
 
 
774
  print("This is the first time through the loop, resetting latest_batch_completed to 0")
775
  if (latest_batch_completed == 999) | (latest_batch_completed == 0):
776
  latest_batch_completed = 0
777
+ out_message = list()
778
+ out_file_paths = list()
779
  final_time = 0
780
 
781
  if (model_choice == CHOSEN_LOCAL_MODEL_TYPE) & (RUN_LOCAL_MODEL == "1"):
 
796
  out_message = [out_message]
797
 
798
  if not out_file_paths:
799
+ out_file_paths = list()
800
 
801
 
802
  if "anthropic.claude-3-sonnet" in model_choice and file_data.shape[1] > 300:
 
820
  simplified_csv_table_path, normalised_simple_markdown_table, start_row, end_row, batch_basic_response_df = data_file_to_markdown_table(file_data, file_name, chosen_cols, latest_batch_completed, batch_size)
821
 
822
  # Conversation history
823
+ conversation_history = list()
824
 
825
  # If the latest batch of responses contains at least one instance of text
826
  if not batch_basic_response_df.empty:
 
934
  except Exception as e:
935
  print(f"Error writing prompt to file {formatted_prompt_output_path}: {e}")
936
 
937
+ #if "Local" in model_source:
938
+ # summary_prompt_list = [full_prompt] # Includes system prompt
939
+ #else:
940
+ summary_prompt_list = [formatted_summary_prompt]
941
+
942
+ if "Local" in model_source and reasoning_suffix: formatted_system_prompt = formatted_system_prompt + "\n" + reasoning_suffix
943
 
944
+ conversation_history = list()
945
+ whole_conversation = list()
946
 
947
  # Process requests to large language model
948
  responses, conversation_history, whole_conversation, whole_conversation_metadata, response_text = call_llm_with_markdown_table_checks(summary_prompt_list, formatted_system_prompt, conversation_history, whole_conversation, whole_conversation_metadata, google_client, google_config, model_choice, temperature, reported_batch_no, local_model, bedrock_runtime, model_source, MAX_OUTPUT_VALIDATION_ATTEMPTS, assistant_prefill=add_existing_topics_assistant_prefill, master = True)
 
956
 
957
  if isinstance(responses[-1], ResponseObject):
958
  with open(final_table_output_path, "w", encoding='utf-8-sig', errors='replace') as f:
959
+ #f.write(responses[-1].text)
960
+ f.write(response_text)
961
  elif "choices" in responses[-1]:
962
  with open(final_table_output_path, "w", encoding='utf-8-sig', errors='replace') as f:
963
+ #f.write(responses[-1]["choices"][0]['text'])
964
+ f.write(response_text)
965
  else:
966
  with open(final_table_output_path, "w", encoding='utf-8-sig', errors='replace') as f:
967
+ #f.write(responses[-1].text)
968
+ f.write(response_text)
969
 
970
  except Exception as e:
971
  print("Error in returning model response:", e)
 
1026
  if prompt3: formatted_prompt3 = prompt3.format(response_table=normalised_simple_markdown_table, sentiment_choices=sentiment_prompt)
1027
  else: formatted_prompt3 = prompt3
1028
 
1029
+ #if "Local" in model_source:
1030
+ #formatted_initial_table_prompt = llama_cpp_prefix + formatted_initial_table_system_prompt + "\n" + formatted_initial_table_prompt + llama_cpp_suffix
1031
+ #formatted_prompt2 = llama_cpp_prefix + formatted_initial_table_system_prompt + "\n" + formatted_prompt2 + llama_cpp_suffix
1032
+ #formatted_prompt3 = llama_cpp_prefix + formatted_initial_table_system_prompt + "\n" + formatted_prompt3 + llama_cpp_suffix
1033
+
1034
+ batch_prompts = [formatted_initial_table_prompt, formatted_prompt2, formatted_prompt3][:number_of_prompts_used] # Adjust this list to send fewer requests
1035
 
1036
+ if "Local" in model_source and reasoning_suffix: formatted_initial_table_system_prompt = formatted_initial_table_system_prompt + "\n" + reasoning_suffix
1037
 
1038
+ whole_conversation = list()
1039
 
1040
  responses, conversation_history, whole_conversation, whole_conversation_metadata, response_text = call_llm_with_markdown_table_checks(batch_prompts, formatted_initial_table_system_prompt, conversation_history, whole_conversation, whole_conversation_metadata, google_client, google_config, model_choice, temperature, reported_batch_no, local_model, bedrock_runtime, model_source, MAX_OUTPUT_VALIDATION_ATTEMPTS, assistant_prefill=initial_table_assistant_prefill)
1041
 
1042
  topic_table_out_path, reference_table_out_path, topic_summary_df_out_path, topic_table_df, reference_df, new_topic_summary_df, batch_file_path_details, is_error = write_llm_output_and_logs(responses, whole_conversation, whole_conversation_metadata, file_name, latest_batch_completed, start_row, end_row, model_choice_clean, temperature, log_files_output_paths, existing_reference_df, existing_topic_summary_df, batch_size, chosen_cols, batch_basic_response_df, group_name, produce_structures_summary_radio, first_run=True, output_folder=output_folder)
1043
 
1044
  # If error in table parsing, leave function
1045
+ if is_error: raise Exception("Error in output table parsing")
1046
 
1047
  topic_table_df.to_csv(topic_table_out_path, index=None)
1048
  out_file_paths.append(topic_table_out_path)
 
1059
  new_topic_summary_df.to_csv(topic_summary_df_out_path, index=None)
1060
  out_file_paths.append(topic_summary_df_out_path)
1061
 
 
1062
 
1063
  whole_conversation_metadata.append(whole_conversation_metadata_str)
1064
  whole_conversation_metadata_str = '. '.join(whole_conversation_metadata)
1065
 
 
1066
  # Write final output to text file for logging purposes
1067
  try:
1068
  final_table_output_path = output_folder + batch_file_path_details + "_full_response_" + model_choice_clean + ".txt"
1069
 
1070
  if isinstance(responses[-1], ResponseObject):
1071
  with open(final_table_output_path, "w", encoding='utf-8-sig', errors='replace') as f:
1072
+ #f.write(responses[-1].text)
1073
+ f.write(response_text)
1074
  elif "choices" in responses[-1]:
1075
  with open(final_table_output_path, "w", encoding='utf-8-sig', errors='replace') as f:
1076
+ #f.write(responses[-1]["choices"][0]['text'])
1077
+ f.write(response_text)
1078
  else:
1079
  with open(final_table_output_path, "w", encoding='utf-8-sig', errors='replace') as f:
1080
+ #f.write(responses[-1].text)
1081
+ f.write(response_text)
1082
 
1083
+ except Exception as e: print("Error in returning model response:", e)
 
1084
 
1085
  new_topic_df = topic_table_df
1086
  new_reference_df = reference_df
 
1125
  # Set to a very high number so as not to mess with subsequent file processing by the user
1126
  #latest_batch_completed = 999
1127
 
1128
+ join_file_paths = list()
1129
 
1130
  toc = time.perf_counter()
1131
  final_time = (toc - tic) + time_taken
 
1201
 
1202
  print("latest_batch_completed at end of batch iterations to return is", latest_batch_completed)
1203
 
1204
+ print("whole_conversation_metadata_str at end of batch iterations to return is", whole_conversation_metadata_str)
1205
+
1206
  return unique_table_df_display_table_markdown, existing_topics_table, final_out_topic_summary_df, existing_reference_df, final_out_file_paths, final_out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, final_out_file_paths, final_out_file_paths, modifiable_topic_summary_df, final_out_file_paths, join_file_paths, existing_reference_df_pivot, missing_df
1207
 
1208
+ print("whole_conversation_metadata_str at end of batch iterations to return is", whole_conversation_metadata_str)
1209
+
1210
  return unique_table_df_display_table_markdown, existing_topics_table, existing_topic_summary_df, existing_reference_df, out_file_paths, out_file_paths, latest_batch_completed, log_files_output_paths, log_files_output_paths, whole_conversation_metadata_str, final_time, out_file_paths, out_file_paths, modifiable_topic_summary_df, out_file_paths, join_file_paths, existing_reference_df_pivot, missing_df # gr.Dataframe(value=modifiable_topic_summary_df, headers=None, col_count=(modifiable_topic_summary_df.shape[1], "fixed"), row_count = (modifiable_topic_summary_df.shape[0], "fixed"), visible=True, type="pandas"),
1211
 
1212
  def wrapper_extract_topics_per_column_value(
 
1244
  context_textbox: str = "",
1245
  sentiment_checkbox: str = "Negative, Neutral, or Positive",
1246
  force_zero_shot_radio: str = "No",
1247
+ in_excel_sheets: List[str] = list(),
1248
  force_single_topic_radio: str = "No",
1249
  produce_structures_summary_radio: str = "No",
1250
  aws_access_key_textbox:str="",
 
1286
  acc_missing_df = pd.DataFrame()
1287
 
1288
  # Lists are extended
1289
+ acc_out_file_paths = list()
1290
+ acc_log_files_output_paths = list()
1291
+ acc_join_file_paths = list() # join_file_paths appears to be rebuilt on each call, so extend to keep every segment's paths
1292
 
1293
  # Single value outputs - typically the last one is most relevant, or sum for time
1294
  acc_markdown_output = initial_unique_table_df_display_table_markdown
 
1360
  num_batches=current_num_batches,
1361
  latest_batch_completed=current_latest_batch_completed, # Reset for each new segment's internal batching
1362
  first_loop_state=current_first_loop_state, # True only for the very first iteration of wrapper
1363
+ out_message= list(), # Fresh for each call
1364
+ out_file_paths= list(),# Fresh for each call
1365
+ log_files_output_paths= list(),# Fresh for each call
1366
  whole_conversation_metadata_str="", # Fresh for each call
1367
  time_taken=0, # Time taken for this specific call, wrapper sums it.
1368
  # Pass through other parameters
 
1457
  unique_table_df_display_table = acc_topic_summary_df.apply(lambda col: col.map(lambda x: wrap_text(x, max_text_length=500)))
1458
  acc_markdown_output = unique_table_df_display_table[["General topic", "Subtopic", "Sentiment", "Number of responses", "Summary", "Group"]].to_markdown(index=False)
1459
 
1460
+ print("acc_whole_conversation_metadata at end of wrapper is", acc_whole_conversation_metadata)
1461
+
1462
  acc_input_tokens, acc_output_tokens, acc_number_of_calls = calculate_tokens_from_metadata(acc_whole_conversation_metadata, model_choice, model_name_map)
1463
 
1464
+ print("acc_input_tokens, acc_output_tokens, acc_number_of_calls at end of wrapper is", acc_input_tokens, acc_output_tokens, acc_number_of_calls)
1465
+
1466
  print(f"\nWrapper finished processing all segments. Total time: {acc_total_time_taken:.2f}s")
1467
 
1468
  # The return signature should match extract_topics.
 
1548
  reference_file_path = os.path.basename(reference_files[0]) if reference_files else None
1549
  unique_table_file_path = os.path.basename(unique_files[0]) if unique_files else None
1550
 
1551
+ output_file_list = list()
1552
 
1553
  if reference_file_path and unique_table_file_path:
1554
 
tools/llm_funcs.py CHANGED
@@ -7,6 +7,7 @@ import pandas as pd
7
  import json
8
  from tqdm import tqdm
9
  from huggingface_hub import hf_hub_download
 
10
  from typing import List, Tuple, TypeVar
11
  from google import genai as ai
12
  from google.genai import types
@@ -17,12 +18,18 @@ torch.cuda.empty_cache()
17
 
18
  model_type = None # global variable setup
19
  full_text = "" # Define dummy source text (full text) just to enable highlight function to load
20
- model = [] # Define empty list for model functions to run
21
- tokenizer = [] #[] # Define empty list for model functions to run
22
 
23
- from tools.config import RUN_AWS_FUNCTIONS, AWS_REGION, LLM_TEMPERATURE, LLM_TOP_K, LLM_TOP_P, LLM_REPETITION_PENALTY, LLM_LAST_N_TOKENS, LLM_MAX_NEW_TOKENS, LLM_SEED, LLM_RESET, LLM_STREAM, LLM_THREADS, LLM_BATCH_SIZE, LLM_CONTEXT_LENGTH, LLM_SAMPLE, MAX_TOKENS, TIMEOUT_WAIT, NUMBER_OF_RETRY_ATTEMPTS, MAX_TIME_FOR_LOOP, BATCH_SIZE_DEFAULT, DEDUPLICATION_THRESHOLD, MAX_COMMENT_CHARS, RUN_LOCAL_MODEL, CHOSEN_LOCAL_MODEL_TYPE, LOCAL_REPO_ID, LOCAL_MODEL_FILE, LOCAL_MODEL_FOLDER, HF_TOKEN, LLM_SEED, LLM_MAX_GPU_LAYERS
24
  from tools.prompts import initial_table_assistant_prefill
25
 
26
  # Both models are loaded on app initialisation so that users don't have to wait for the models to be downloaded
27
  # Check for torch cuda
28
  print("Is CUDA enabled? ", torch.cuda.is_available())
@@ -54,13 +61,9 @@ batch_size_default = BATCH_SIZE_DEFAULT
54
  deduplication_threshold = DEDUPLICATION_THRESHOLD
55
  max_comment_character_length = MAX_COMMENT_CHARS
56
 
57
- # if RUN_AWS_FUNCTIONS == '1':
58
- # bedrock_runtime = boto3.client('bedrock-runtime', region_name=AWS_REGION)
59
- # else:
60
- # bedrock_runtime = []
61
 
62
  if not LLM_THREADS:
63
- threads = torch.get_num_threads() # 8
64
  else: threads = LLM_THREADS
65
  print("CPU threads:", threads)
66
 
@@ -76,7 +79,8 @@ else: sample = False
76
  temperature = LLM_TEMPERATURE
77
  top_k = LLM_TOP_K
78
  top_p = LLM_TOP_P
79
- repetition_penalty = LLM_REPETITION_PENALTY # Mild repetition penalty to prevent repeating table rows
 
80
  last_n_tokens = LLM_LAST_N_TOKENS
81
  max_new_tokens: int = LLM_MAX_NEW_TOKENS
82
  seed: int = LLM_SEED
@@ -86,6 +90,7 @@ threads: int = threads
86
  batch_size:int = LLM_BATCH_SIZE
87
  context_length:int = LLM_CONTEXT_LENGTH
88
  sample = LLM_SAMPLE
 
89
 
90
  class llama_cpp_init_config_gpu:
91
  def __init__(self,
@@ -122,6 +127,7 @@ cpu_config = llama_cpp_init_config_cpu()
122
  class LlamaCPPGenerationConfig:
123
  def __init__(self, temperature=temperature,
124
  top_k=top_k,
 
125
  top_p=top_p,
126
  repeat_penalty=repetition_penalty,
127
  seed=seed,
@@ -164,6 +170,7 @@ def get_model_path(repo_id=LOCAL_REPO_ID, model_filename=LOCAL_MODEL_FILE, model
164
  if hf_token:
165
  downloaded_model_path = hf_hub_download(repo_id=repo_id, token=hf_token, filename=model_filename)
166
  else:
 
167
  downloaded_model_path = hf_hub_download(repo_id=repo_id, filename=model_filename)
168
 
169
  return downloaded_model_path
@@ -185,13 +192,16 @@ def load_model(local_model_type:str=CHOSEN_LOCAL_MODEL_TYPE, gpu_layers:int=gpu_
185
 
186
  try:
187
  print("GPU load variables:" , vars(gpu_config))
188
- llama_model = Llama(model_path=model_path, type_k=8, type_v=8, flash_attn=True, **vars(gpu_config)) # type_k=8, type_v = 8, flash_attn=True,
189
-
190
  except Exception as e:
191
  print("GPU load failed due to:", e)
192
- llama_model = Llama(model_path=model_path, type_k=8, **vars(cpu_config)) # type_v = 8, flash_attn=True,
193
 
194
- print("Loading with", gpu_config.n_gpu_layers, "model layers sent to GPU. And a maximum context length of ", gpu_config.n_ctx)
195
 
196
  # CPU mode
197
  else:
@@ -202,11 +212,14 @@ def load_model(local_model_type:str=CHOSEN_LOCAL_MODEL_TYPE, gpu_layers:int=gpu_
202
  gpu_config.update_context(max_context_length)
203
  cpu_config.update_context(max_context_length)
204
 
205
- llama_model = Llama(model_path=model_path, type_k=8, **vars(cpu_config)) # type_v = 8, flash_attn=True,
 
 
 
206
 
207
- print("Loading with", cpu_config.n_gpu_layers, "model layers sent to GPU. And a maximum context length of ", gpu_config.n_ctx)
208
 
209
- tokenizer = []
210
 
211
  print("Finished loading model:", local_model_type)
212
  print("GPU layers assigned to cuda:", gpu_layers)
@@ -244,6 +257,47 @@ def call_llama_cpp_model(formatted_string:str, gen_config:str, model=model):
244
 
245
  return output
246
247
  # This function is not used in this app
248
  def llama_cpp_streaming(history, full_prompt, temperature=temperature):
249
 
@@ -392,7 +446,7 @@ def call_aws_claude(prompt: str, system_prompt: str, temperature: float, max_tok
392
  return response
393
 
394
  # Function to send a request and update history
395
- def send_request(prompt: str, conversation_history: List[dict], google_client: ai.Client, config: types.GenerateContentConfig, model_choice: str, system_prompt: str, temperature: float, bedrock_runtime:boto3.Session.client, model_source:str, local_model=[], assistant_prefill = "", progress=Progress(track_tqdm=True)) -> Tuple[str, List[dict]]:
396
  """
397
  This function sends a request to a language model with the given prompt, conversation history, model configuration, model choice, system prompt, and temperature.
398
  It constructs the full prompt by appending the new user prompt to the conversation history, generates a response from the model, and updates the conversation history with the new prompt and response.
@@ -421,16 +475,9 @@ def send_request(prompt: str, conversation_history: List[dict], google_client: a
421
  for i in progress_bar:
422
  try:
423
  print("Calling Gemini model, attempt", i + 1)
424
- #print("google_client:", google_client)
425
- #print("model_choice:", model_choice)
426
- #print("full_prompt:", full_prompt)
427
- #print("generation_config:", config)
428
 
429
  response = google_client.models.generate_content(model=model_choice, contents=full_prompt, config=config)
430
 
431
- #progress_bar.close()
432
- #tqdm._instances.clear()
433
-
434
  print("Successful call to Gemini model.")
435
  break
436
  except Exception as e:
@@ -447,9 +494,6 @@ def send_request(prompt: str, conversation_history: List[dict], google_client: a
447
  print("Calling AWS Claude model, attempt", i + 1)
448
  response = call_aws_claude(prompt, system_prompt, temperature, max_tokens, model_choice, bedrock_runtime=bedrock_runtime, assistant_prefill=assistant_prefill)
449
 
450
- #progress_bar.close()
451
- #tqdm._instances.clear()
452
-
453
  print("Successful call to Claude model.")
454
  break
455
  except Exception as e:
@@ -468,10 +512,7 @@ def send_request(prompt: str, conversation_history: List[dict], google_client: a
468
  gen_config = LlamaCPPGenerationConfig()
469
  gen_config.update_temp(temperature)
470
 
471
- response = call_llama_cpp_model(prompt, gen_config, model=local_model)
472
-
473
- #progress_bar.close()
474
- #tqdm._instances.clear()
475
 
476
  print("Successful call to local model. Response:", response)
477
  break
@@ -492,7 +533,7 @@ def send_request(prompt: str, conversation_history: List[dict], google_client: a
492
  if isinstance(response, ResponseObject):
493
  conversation_history.append({'role': 'assistant', 'parts': [response.text]})
494
  elif 'choices' in response:
495
- conversation_history.append({'role': 'assistant', 'parts': [response['choices'][0]['text']]})
496
  else:
497
  conversation_history.append({'role': 'assistant', 'parts': [response.text]})
498
 
@@ -501,7 +542,7 @@ def send_request(prompt: str, conversation_history: List[dict], google_client: a
501
 
502
  return response, conversation_history
503
 
504
- def process_requests(prompts: List[str], system_prompt: str, conversation_history: List[dict], whole_conversation: List[str], whole_conversation_metadata: List[str], google_client: ai.Client, config: types.GenerateContentConfig, model_choice: str, temperature: float, bedrock_runtime:boto3.Session.client, model_source:str, batch_no:int = 1, local_model = [], master:bool = False, assistant_prefill="") -> Tuple[List[ResponseObject], List[dict], List[str], List[str]]:
505
  """
506
  Processes a list of prompts by sending them to the model, appending the responses to the conversation history, and updating the whole conversation and metadata.
507
 
@@ -525,21 +566,19 @@ def process_requests(prompts: List[str], system_prompt: str, conversation_histor
525
  Returns:
526
  Tuple[List[ResponseObject], List[dict], List[str], List[str]]: A tuple containing the list of responses, the updated conversation history, the updated whole conversation, and the updated whole conversation metadata.
527
  """
528
- responses = []
529
 
530
  # Clear any existing progress bars
531
  tqdm._instances.clear()
532
 
533
  for prompt in prompts:
534
 
535
- #print("prompt to LLM:", prompt)
536
-
537
  response, conversation_history = send_request(prompt, conversation_history, google_client=google_client, config=config, model_choice=model_choice, system_prompt=system_prompt, temperature=temperature, local_model=local_model, assistant_prefill=assistant_prefill, bedrock_runtime=bedrock_runtime, model_source=model_source)
538
 
539
  if isinstance(response, ResponseObject):
540
  response_text = response.text
541
  elif 'choices' in response:
542
- response_text = response['choices'][0]['text']
543
  else:
544
  response_text = response.text
545
 
@@ -550,9 +589,10 @@ def process_requests(prompts: List[str], system_prompt: str, conversation_histor
550
 
551
  # Create conversation metadata
552
  if master == False:
553
- whole_conversation_metadata.append(f"Query batch {batch_no} prompt {len(responses)} metadata:")
554
  else:
555
- whole_conversation_metadata.append(f"Query summary metadata:")
 
556
 
557
  if not isinstance(response, str):
558
  try:
@@ -571,6 +611,9 @@ def process_requests(prompts: List[str], system_prompt: str, conversation_histor
571
  elif "gemini" in model_choice:
572
  whole_conversation_metadata.append(str(response.usage_metadata))
573
  else:
574
  whole_conversation_metadata.append(str(response['usage']))
575
  except KeyError as e:
576
  print(f"Key error: {e} - Check the structure of response.usage_metadata")
@@ -638,10 +681,14 @@ def call_llm_with_markdown_table_checks(batch_prompts: List[str],
638
  call_temperature, bedrock_runtime, model_source, reported_batch_no, local_model, master=master, assistant_prefill=assistant_prefill
639
  )
640
 
641
- if model_choice != CHOSEN_LOCAL_MODEL_TYPE:
642
- stripped_response = responses[-1].text.strip()
643
- else:
644
- stripped_response = responses[-1]['choices'][0]['text'].strip()
645
 
646
  # Check if response meets our criteria (length and contains table)
647
  if len(stripped_response) > 120 and '|' in stripped_response:
@@ -656,7 +703,7 @@ def call_llm_with_markdown_table_checks(batch_prompts: List[str],
656
  else: # This runs if no break occurred (all attempts failed)
657
  print(f"Failed to get valid response after {MAX_OUTPUT_VALIDATION_ATTEMPTS} attempts")
658
 
659
- return responses, conversation_history, whole_conversation, whole_conversation_metadata, response_text
660
 
661
  def create_missing_references_df(basic_response_df: pd.DataFrame, existing_reference_df: pd.DataFrame) -> pd.DataFrame:
662
  """
 
7
  import json
8
  from tqdm import tqdm
9
  from huggingface_hub import hf_hub_download
10
+ from llama_cpp.llama_speculative import LlamaPromptLookupDecoding
11
  from typing import List, Tuple, TypeVar
12
  from google import genai as ai
13
  from google.genai import types
 
18
 
19
  model_type = None # global variable setup
20
  full_text = "" # Define dummy source text (full text) just to enable highlight function to load
21
+ model = list() # Define empty list for model functions to run
22
+ tokenizer = list() #[] # Define empty list for model functions to run
23
 
24
+ from tools.config import RUN_AWS_FUNCTIONS, AWS_REGION, LLM_TEMPERATURE, LLM_TOP_K, LLM_MIN_P, LLM_TOP_P, LLM_REPETITION_PENALTY, LLM_LAST_N_TOKENS, LLM_MAX_NEW_TOKENS, LLM_SEED, LLM_RESET, LLM_STREAM, LLM_THREADS, LLM_BATCH_SIZE, LLM_CONTEXT_LENGTH, LLM_SAMPLE, MAX_TOKENS, TIMEOUT_WAIT, NUMBER_OF_RETRY_ATTEMPTS, MAX_TIME_FOR_LOOP, BATCH_SIZE_DEFAULT, DEDUPLICATION_THRESHOLD, MAX_COMMENT_CHARS, RUN_LOCAL_MODEL, CHOSEN_LOCAL_MODEL_TYPE, LOCAL_REPO_ID, LOCAL_MODEL_FILE, LOCAL_MODEL_FOLDER, HF_TOKEN, LLM_MAX_GPU_LAYERS, SPECULATIVE_DECODING, NUM_PRED_TOKENS
25
  from tools.prompts import initial_table_assistant_prefill
26
 
27
+ if str(SPECULATIVE_DECODING).lower() == "true": SPECULATIVE_DECODING = True
28
+ else: SPECULATIVE_DECODING = False
29
+
30
+ if isinstance(NUM_PRED_TOKENS, str): NUM_PRED_TOKENS = int(NUM_PRED_TOKENS)
31
+
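
A minimal sketch, not taken from this commit, of how these two settings feed the llama.cpp loader below: prompt-lookup speculative decoding simply supplies a draft model that proposes NUM_PRED_TOKENS candidate tokens per step from text already present in the prompt.

    # Illustrative only; model_path and gpu_config are the names used later in this file
    draft = LlamaPromptLookupDecoding(num_pred_tokens=NUM_PRED_TOKENS) if SPECULATIVE_DECODING else None
    # llama_model = Llama(model_path=model_path, draft_model=draft, **vars(gpu_config))
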
33
  # Both models are loaded on app initialisation so that users don't have to wait for the models to be downloaded
34
  # Check for torch cuda
35
  print("Is CUDA enabled? ", torch.cuda.is_available())
 
61
  deduplication_threshold = DEDUPLICATION_THRESHOLD
62
  max_comment_character_length = MAX_COMMENT_CHARS
63
 
64
 
65
  if not LLM_THREADS:
66
+ threads = torch.get_num_threads()
67
  else: threads = LLM_THREADS
68
  print("CPU threads:", threads)
69
 
 
79
  temperature = LLM_TEMPERATURE
80
  top_k = LLM_TOP_K
81
  top_p = LLM_TOP_P
82
+ min_p = LLM_MIN_P
83
+ repetition_penalty = LLM_REPETITION_PENALTY
84
  last_n_tokens = LLM_LAST_N_TOKENS
85
  max_new_tokens: int = LLM_MAX_NEW_TOKENS
86
  seed: int = LLM_SEED
 
90
  batch_size:int = LLM_BATCH_SIZE
91
  context_length:int = LLM_CONTEXT_LENGTH
92
  sample = LLM_SAMPLE
93
+ speculative_decoding = SPECULATIVE_DECODING
94
 
95
  class llama_cpp_init_config_gpu:
96
  def __init__(self,
 
127
  class LlamaCPPGenerationConfig:
128
  def __init__(self, temperature=temperature,
129
  top_k=top_k,
130
+ min_p=min_p,
131
  top_p=top_p,
132
  repeat_penalty=repetition_penalty,
133
  seed=seed,
 
170
  if hf_token:
171
  downloaded_model_path = hf_hub_download(repo_id=repo_id, token=hf_token, filename=model_filename)
172
  else:
173
+ print("No HF token found, downloading model from Hugging Face Hub without token")
174
  downloaded_model_path = hf_hub_download(repo_id=repo_id, filename=model_filename)
175
 
176
  return downloaded_model_path
 
192
 
193
  try:
194
  print("GPU load variables:" , vars(gpu_config))
195
+ if speculative_decoding:
196
+ llama_model = Llama(model_path=model_path, type_k=8, type_v=8, flash_attn=True, draft_model=LlamaPromptLookupDecoding(num_pred_tokens=NUM_PRED_TOKENS), **vars(gpu_config))
197
+ else:
198
+ llama_model = Llama(model_path=model_path, type_k=8, type_v=8, flash_attn=True, **vars(gpu_config))
199
+
200
  except Exception as e:
201
  print("GPU load failed due to:", e)
202
+ llama_model = Llama(model_path=model_path, type_k=8, **vars(cpu_config))
203
 
204
+ print("Loading with", gpu_config.n_gpu_layers, "model layers sent to GPU and a maximum context length of", gpu_config.n_ctx)
205
 
206
  # CPU mode
207
  else:
 
212
  gpu_config.update_context(max_context_length)
213
  cpu_config.update_context(max_context_length)
214
 
215
+ if speculative_decoding:
216
+ llama_model = Llama(model_path=model_path, type_k=8, draft_model=LlamaPromptLookupDecoding(num_pred_tokens=NUM_PRED_TOKENS), **vars(cpu_config))
217
+ else:
218
+ llama_model = Llama(model_path=model_path, type_k=8, **vars(cpu_config))
219
 
220
+ print("Loading with", cpu_config.n_gpu_layers, "model layers sent to GPU and a maximum context length of", cpu_config.n_ctx)
221
 
222
+ tokenizer = list()
223
 
224
  print("Finished loading model:", local_model_type)
225
  print("GPU layers assigned to cuda:", gpu_layers)
 
257
 
258
  return output
259
 
260
+ def call_llama_cpp_chatmodel(formatted_string:str, system_prompt:str, gen_config:LlamaCPPGenerationConfig, model=model):
261
+ """
262
+ Calls a llama.cpp chat model with a formatted user message and system prompt,
263
+ using generation parameters from the LlamaCPPGenerationConfig object.
264
+
265
+ Args:
266
+ formatted_string (str): The formatted input text for the user's message.
267
+ system_prompt (str): The system-level instructions for the model.
268
+ gen_config (LlamaCPPGenerationConfig): An object containing generation parameters.
269
+ model (Llama): The Llama.cpp model instance to use for chat completion.
270
+
+ Returns:
+ The chat completion result from create_chat_completion: an OpenAI-style dict when stream=False, otherwise an iterator of streamed chunks.
+ """
271
+ # Extracting parameters from the gen_config object
272
+ temperature = gen_config.temperature
273
+ top_k = gen_config.top_k
274
+ top_p = gen_config.top_p
275
+ repeat_penalty = gen_config.repeat_penalty
276
+ seed = gen_config.seed
277
+ max_tokens = gen_config.max_tokens
278
+ stream = gen_config.stream
279
+
280
+ # Call the model's chat completion method with the extracted parameters:
281
+ output = model.create_chat_completion(
282
+ messages=[
283
+ {"role": "system", "content": system_prompt},
284
+ {
285
+ "role": "user",
286
+ "content": formatted_string
287
+ }
288
+ ],
289
+ temperature=temperature,
290
+ top_k=top_k,
291
+ top_p=top_p,
292
+ repeat_penalty=repeat_penalty,
293
+ seed=seed,
294
+ max_tokens=max_tokens,
295
+ stream=stream
296
+ #stop=["<|eot_id|>", "\n\n"]
297
+ )
298
+
299
+ return output
300
+
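
A minimal usage sketch, not taken from this commit, for the new chat-completion path. It assumes local_model is the Llama instance returned by load_model and that streaming is off, so create_chat_completion returns an OpenAI-style dictionary.

    # Hypothetical example only
    gen_config = LlamaCPPGenerationConfig()   # defaults defined above
    gen_config.update_temp(0.2)               # same helper used in send_request
    result = call_llama_cpp_chatmodel("List three topics as a markdown table.", "You are a helpful analyst.", gen_config, model=local_model)
    print(result['choices'][0]['message']['content'])  # assistant text
    print(result['usage'])                             # prompt_tokens / completion_tokens / total_tokens
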
301
  # This function is not used in this app
302
  def llama_cpp_streaming(history, full_prompt, temperature=temperature):
303
 
 
446
  return response
447
 
448
  # Function to send a request and update history
449
+ def send_request(prompt: str, conversation_history: List[dict], google_client: ai.Client, config: types.GenerateContentConfig, model_choice: str, system_prompt: str, temperature: float, bedrock_runtime:boto3.Session.client, model_source:str, local_model= list(), assistant_prefill = "", progress=Progress(track_tqdm=True)) -> Tuple[str, List[dict]]:
450
  """
451
  This function sends a request to a language model with the given prompt, conversation history, model configuration, model choice, system prompt, and temperature.
452
  It constructs the full prompt by appending the new user prompt to the conversation history, generates a response from the model, and updates the conversation history with the new prompt and response.
 
475
  for i in progress_bar:
476
  try:
477
  print("Calling Gemini model, attempt", i + 1)
 
 
 
 
478
 
479
  response = google_client.models.generate_content(model=model_choice, contents=full_prompt, config=config)
480
 
 
 
 
481
  print("Successful call to Gemini model.")
482
  break
483
  except Exception as e:
 
494
  print("Calling AWS Claude model, attempt", i + 1)
495
  response = call_aws_claude(prompt, system_prompt, temperature, max_tokens, model_choice, bedrock_runtime=bedrock_runtime, assistant_prefill=assistant_prefill)
496
 
 
 
 
497
  print("Successful call to Claude model.")
498
  break
499
  except Exception as e:
 
512
  gen_config = LlamaCPPGenerationConfig()
513
  gen_config.update_temp(temperature)
514
 
515
+ response = call_llama_cpp_chatmodel(prompt, system_prompt, gen_config, model=local_model)
 
 
 
516
 
517
  print("Successful call to local model. Response:", response)
518
  break
 
533
  if isinstance(response, ResponseObject):
534
  conversation_history.append({'role': 'assistant', 'parts': [response.text]})
535
  elif 'choices' in response:
536
+ conversation_history.append({'role': 'assistant', 'parts': [response['choices'][0]['message']['content']]}) #response['choices'][0]['text']]})
537
  else:
538
  conversation_history.append({'role': 'assistant', 'parts': [response.text]})
539
 
 
542
 
543
  return response, conversation_history
544
 
545
+ def process_requests(prompts: List[str], system_prompt: str, conversation_history: List[dict], whole_conversation: List[str], whole_conversation_metadata: List[str], google_client: ai.Client, config: types.GenerateContentConfig, model_choice: str, temperature: float, bedrock_runtime:boto3.Session.client, model_source:str, batch_no:int = 1, local_model = list(), master:bool = False, assistant_prefill="") -> Tuple[List[ResponseObject], List[dict], List[str], List[str]]:
546
  """
547
  Processes a list of prompts by sending them to the model, appending the responses to the conversation history, and updating the whole conversation and metadata.
548
 
 
566
  Returns:
567
  Tuple[List[ResponseObject], List[dict], List[str], List[str]]: A tuple containing the list of responses, the updated conversation history, the updated whole conversation, and the updated whole conversation metadata.
568
  """
569
+ responses = list()
570
 
571
  # Clear any existing progress bars
572
  tqdm._instances.clear()
573
 
574
  for prompt in prompts:
575
 
 
 
576
  response, conversation_history = send_request(prompt, conversation_history, google_client=google_client, config=config, model_choice=model_choice, system_prompt=system_prompt, temperature=temperature, local_model=local_model, assistant_prefill=assistant_prefill, bedrock_runtime=bedrock_runtime, model_source=model_source)
577
 
578
  if isinstance(response, ResponseObject):
579
  response_text = response.text
580
  elif 'choices' in response:
581
+ response_text = response['choices'][0]['message']['content'] # response['choices'][0]['text']
582
  else:
583
  response_text = response.text
584
 
 
589
 
590
  # Create conversation metadata
591
  if master == False:
592
+ whole_conversation_metadata.append(f"Batch {batch_no}:")
593
  else:
594
+ #whole_conversation_metadata.append(f"Query summary metadata:")
595
+ whole_conversation_metadata.append(f"Batch {batch_no}:")
596
 
597
  if not isinstance(response, str):
598
  try:
 
611
  elif "gemini" in model_choice:
612
  whole_conversation_metadata.append(str(response.usage_metadata))
613
  else:
614
+ print("Adding usage metadata to whole conversation metadata:", response['usage'])
615
+ output_tokens = response['usage'].get('completion_tokens', 0)
616
+ input_tokens = response['usage'].get('prompt_tokens', 0)
617
  whole_conversation_metadata.append(str(response['usage']))
618
  except KeyError as e:
619
  print(f"Key error: {e} - Check the structure of response.usage_metadata")
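
For reference, a sketch (with made-up numbers, not from this commit) of the usage block those .get() calls read; llama-cpp-python chat completions report token counts in an OpenAI-style dictionary:

    usage = {"prompt_tokens": 512, "completion_tokens": 128, "total_tokens": 640}
    input_tokens = usage.get('prompt_tokens', 0)      # tokens sent to the model
    output_tokens = usage.get('completion_tokens', 0) # tokens generated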
 
681
  call_temperature, bedrock_runtime, model_source, reported_batch_no, local_model, master=master, assistant_prefill=assistant_prefill
682
  )
683
 
684
+ #if model_source != "Local":
685
+ #stripped_response = responses[-1].text.strip()
686
+ #stripped_response = response_text.strip()
687
+ #else:
688
+ #stripped_response = response['choices'][0]['message']['content'].strip()
689
+ #stripped_response = response_text.strip()
690
+
691
+ stripped_response = response_text.strip()
692
 
693
  # Check if response meets our criteria (length and contains table)
694
  if len(stripped_response) > 120 and '|' in stripped_response:
 
703
  else: # This runs if no break occurred (all attempts failed)
704
  print(f"Failed to get valid response after {MAX_OUTPUT_VALIDATION_ATTEMPTS} attempts")
705
 
706
+ return responses, conversation_history, whole_conversation, whole_conversation_metadata, stripped_response
707
 
708
  def create_missing_references_df(basic_response_df: pd.DataFrame, existing_reference_df: pd.DataFrame) -> pd.DataFrame:
709
  """
tools/verify_titles.py CHANGED
@@ -101,7 +101,7 @@ def write_llm_output_and_logs_verify(responses: List[ResponseObject],
101
  log_files_output_paths.append(whole_conversation_path_meta)
102
 
103
  if isinstance(responses[-1], ResponseObject): response_text = responses[-1].text
104
- elif "choices" in responses[-1]: response_text = responses[-1]["choices"][0]['text']
105
  else: response_text = responses[-1].text
106
 
107
  # Convert response text to a markdown table
@@ -464,13 +464,16 @@ def verify_titles(in_data_file,
464
 
465
  if isinstance(responses[-1], ResponseObject):
466
  with open(final_table_output_path, "w", encoding='utf-8-sig', errors='replace') as f:
467
- f.write(responses[-1].text)
 
468
  elif "choices" in responses[-1]:
469
  with open(final_table_output_path, "w", encoding='utf-8-sig', errors='replace') as f:
470
- f.write(responses[-1]["choices"][0]['text'])
 
471
  else:
472
  with open(final_table_output_path, "w", encoding='utf-8-sig', errors='replace') as f:
473
- f.write(responses[-1].text)
 
474
 
475
  except Exception as e:
476
  print("Error in returning model response:", e)
@@ -581,15 +584,18 @@ def verify_titles(in_data_file,
581
 
582
  if isinstance(responses[-1], ResponseObject):
583
  with open(final_table_output_path, "w", encoding='utf-8-sig', errors='replace') as f:
584
- f.write(responses[-1].text)
 
585
  unique_table_df_display_table_markdown = responses[-1].text
586
  elif "choices" in responses[-1]:
587
  with open(final_table_output_path, "w", encoding='utf-8-sig', errors='replace') as f:
588
- f.write(responses[-1]["choices"][0]['text'])
589
- unique_table_df_display_table_markdown =responses[-1]["choices"][0]['text']
 
590
  else:
591
  with open(final_table_output_path, "w", encoding='utf-8-sig', errors='replace') as f:
592
- f.write(responses[-1].text)
 
593
  unique_table_df_display_table_markdown = responses[-1].text
594
 
595
  log_files_output_paths.append(final_table_output_path)
 
101
  log_files_output_paths.append(whole_conversation_path_meta)
102
 
103
  if isinstance(responses[-1], ResponseObject): response_text = responses[-1].text
104
+ elif "choices" in responses[-1]: response_text = responses[-1]['choices'][0]['message']['content'] #responses[-1]["choices"][0]['text']
105
  else: response_text = responses[-1].text
106
 
107
  # Convert response text to a markdown table
 
464
 
465
  if isinstance(responses[-1], ResponseObject):
466
  with open(final_table_output_path, "w", encoding='utf-8-sig', errors='replace') as f:
467
+ #f.write(responses[-1].text)
468
+ f.write(response_text)
469
  elif "choices" in responses[-1]:
470
  with open(final_table_output_path, "w", encoding='utf-8-sig', errors='replace') as f:
471
+ #f.write(responses[-1]["choices"][0]['text'])
472
+ f.write(response_text)
473
  else:
474
  with open(final_table_output_path, "w", encoding='utf-8-sig', errors='replace') as f:
475
+ #f.write(responses[-1].text)
476
+ f.write(response_text)
477
 
478
  except Exception as e:
479
  print("Error in returning model response:", e)
 
584
 
585
  if isinstance(responses[-1], ResponseObject):
586
  with open(final_table_output_path, "w", encoding='utf-8-sig', errors='replace') as f:
587
+ #f.write(responses[-1].text)
588
+ f.write(response_text)
589
  unique_table_df_display_table_markdown = responses[-1].text
590
  elif "choices" in responses[-1]:
591
  with open(final_table_output_path, "w", encoding='utf-8-sig', errors='replace') as f:
592
+ #f.write(responses[-1]["choices"][0]['text'])
593
+ f.write(response_text)
594
+ unique_table_df_display_table_markdown = responses[-1]["choices"][0]['message']['content'] #responses[-1]["choices"][0]['text']
595
  else:
596
  with open(final_table_output_path, "w", encoding='utf-8-sig', errors='replace') as f:
597
+ #f.write(responses[-1].text)
598
+ f.write(response_text)
599
  unique_table_df_display_table_markdown = responses[-1].text
600
 
601
  log_files_output_paths.append(final_table_output_path)