seanpedrickcase committed
Commit 72d517c · 1 Parent(s): 8c54223

Enabled GPU-based local model inference with the transformers package

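For reviewers, a minimal sketch of how the new transformers code path is intended to be switched on, assuming the environment variables introduced in tools/config.py below and the updated load_model signature in tools/llm_funcs.py. The call itself is illustrative and not code from this commit.

```python
# Illustrative sketch only: variable names come from tools/config.py in this commit,
# and the load_model call mirrors the updated calls in tools/llm_funcs.py.
import os

os.environ["USE_LLAMA_CPP"] = "False"           # switch from llama.cpp GGUF loading to transformers
os.environ["USE_BITSANDBYTES"] = "True"         # quantise with bitsandbytes (4-bit NF4 by default)
os.environ["MODEL_DTYPE"] = "float16"           # or "bfloat16" on Hopper-class GPUs
os.environ["COMPILE_TRANSFORMERS"] = "True"     # wrap the loaded model in torch.compile
os.environ["COMPILE_MODE"] = "reduce-overhead"  # alternatively "max-autotune"

# tools.config reads these variables at import time, so set them first.
from tools.config import (CHOSEN_LOCAL_MODEL_TYPE, LOCAL_REPO_ID,
                          LOCAL_MODEL_FILE, LOCAL_MODEL_FOLDER, HF_TOKEN)
from tools.llm_funcs import load_model

# With USE_LLAMA_CPP == "False", the Gemma/GPT-OSS repo IDs in tools/config.py resolve
# to their transformers equivalents and load_model returns a transformers model + tokenizer.
local_model, tokenizer = load_model(
    local_model_type=CHOSEN_LOCAL_MODEL_TYPE,
    repo_id=LOCAL_REPO_ID,
    model_filename=LOCAL_MODEL_FILE,
    model_dir=LOCAL_MODEL_FOLDER,
    hf_token=HF_TOKEN,
)
```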
app.py CHANGED
@@ -12,7 +12,7 @@ from tools.custom_csvlogger import CSVLogger_custom
12
  from tools.auth import authenticate_user
13
  from tools.prompts import initial_table_prompt, prompt2, prompt3, system_prompt, add_existing_topics_system_prompt, add_existing_topics_prompt, verify_titles_prompt, verify_titles_system_prompt, two_para_summary_format_prompt, single_para_summary_format_prompt
14
  from tools.verify_titles import verify_titles
15
- from tools.config import RUN_AWS_FUNCTIONS, HOST_NAME, ACCESS_LOGS_FOLDER, FEEDBACK_LOGS_FOLDER, USAGE_LOGS_FOLDER, RUN_LOCAL_MODEL, FILE_INPUT_HEIGHT, GEMINI_API_KEY, model_full_names, BATCH_SIZE_DEFAULT, CHOSEN_LOCAL_MODEL_TYPE, LLM_SEED, COGNITO_AUTH, MAX_QUEUE_SIZE, MAX_FILE_SIZE, GRADIO_SERVER_PORT, ROOT_PATH, INPUT_FOLDER, OUTPUT_FOLDER, S3_LOG_BUCKET, CONFIG_FOLDER, GRADIO_TEMP_DIR, MPLCONFIGDIR, model_name_map, GET_COST_CODES, ENFORCE_COST_CODES, DEFAULT_COST_CODE, COST_CODES_PATH, S3_COST_CODES_PATH, OUTPUT_COST_CODES_PATH, SHOW_COSTS, SAVE_LOGS_TO_CSV, SAVE_LOGS_TO_DYNAMODB, ACCESS_LOG_DYNAMODB_TABLE_NAME, USAGE_LOG_DYNAMODB_TABLE_NAME, FEEDBACK_LOG_DYNAMODB_TABLE_NAME, LOG_FILE_NAME, FEEDBACK_LOG_FILE_NAME, USAGE_LOG_FILE_NAME, CSV_ACCESS_LOG_HEADERS, CSV_FEEDBACK_LOG_HEADERS, CSV_USAGE_LOG_HEADERS, DYNAMODB_ACCESS_LOG_HEADERS, DYNAMODB_FEEDBACK_LOG_HEADERS, DYNAMODB_USAGE_LOG_HEADERS, S3_ACCESS_LOGS_FOLDER, S3_FEEDBACK_LOGS_FOLDER, S3_USAGE_LOGS_FOLDER, AWS_ACCESS_KEY, AWS_SECRET_KEY, SHOW_EXAMPLES
16
 
17
  def ensure_folder_exists(output_folder:str):
18
  """Checks if the specified folder exists, creates it if not."""
@@ -148,14 +148,14 @@ with app:
148
 
149
  Instructions on use can be found in the README.md file. Try it out with this [dummy development consultation dataset](https://huggingface.co/datasets/seanpedrickcase/dummy_development_consultation/tree/main), which you can also try with [zero-shot topics](https://huggingface.co/datasets/seanpedrickcase/dummy_development_consultation/tree/main). Try also this [dummy case notes dataset](https://huggingface.co/datasets/seanpedrickcase/dummy_case_notes/tree/main).
150
 
151
- You can use an AWS Bedrock model (paid), or Gemini (a free API for Flash). The use of Gemini requires an API key. To set up your own Gemini API key, go [here](https://aistudio.google.com/app/u/1/plan_information).
152
 
153
  NOTE: Large language models are not 100% accurate and may produce biased or harmful outputs. All outputs from this app **absolutely need to be checked by a human** for harmful outputs, hallucinations, and accuracy.""")
154
 
155
  with gr.Tab(label="1. Extract topics"):
156
  gr.Markdown("""### Choose a tabular data file (xlsx, csv, parquet) of open text to extract topics from.""")
157
  with gr.Row():
158
- model_choice = gr.Dropdown(value = default_model_choice, choices = model_full_names, label="LLM model", multiselect=False)
159
 
160
  with gr.Accordion("Upload xlsx or csv file", open = True):
161
  in_data_files = gr.File(height=FILE_INPUT_HEIGHT, label="Choose Excel or csv files", file_count= "multiple", file_types=['.xlsx', '.xls', '.csv', '.parquet'])
@@ -177,6 +177,9 @@ with app:
177
 
178
  sentiment_checkbox = gr.Radio(label="Choose sentiment categories to split responses", value="Negative or Positive", choices=["Negative or Positive", "Negative, Neutral, or Positive", "Do not assess sentiment"])
179
 
 
 
 
180
  if GET_COST_CODES == "True" or ENFORCE_COST_CODES == "True":
181
  with gr.Accordion("Assign task to cost code", open = True, visible=True):
182
  gr.Markdown("Please ensure that you have approval from your budget holder before using this app for redaction tasks that incur a cost.")
@@ -188,9 +191,6 @@ with app:
188
 
189
  all_in_one_btn = gr.Button("All in one - Extract topics, deduplicate, and summarise", variant="primary")
190
  extract_topics_btn = gr.Button("1. Extract topics", variant="secondary")
191
-
192
- if SHOW_EXAMPLES == "True":
193
- examples = gr.Examples(examples=[[["example_data/dummy_consultation_response.csv"]], [["example_data/combined_case_notes.csv"]]], inputs=[in_data_files])
194
 
195
  with gr.Row(equal_height=True):
196
  output_messages_textbox = gr.Textbox(value="", label="Output messages", scale=1, interactive=False)
@@ -307,6 +307,9 @@ with app:
307
  with gr.Accordion("Gemini API keys", open = False):
308
  google_api_key_textbox = gr.Textbox(value = GEMINI_API_KEY, label="Enter Gemini API key (only if using Google API models)", lines=1, type="password")
309
 
 
 
 
310
  with gr.Accordion("Log outputs", open = False):
311
  log_files_output = gr.File(height=FILE_INPUT_HEIGHT, label="Log file output", interactive=False)
312
  conversation_metadata_textbox = gr.Textbox(value="", label="Query metadata - usage counts and other parameters", lines=8)
@@ -350,7 +353,7 @@ with app:
350
  ###
351
 
352
  # Tabular data upload
353
- in_data_files.change(fn=put_columns_in_df, inputs=[in_data_files], outputs=[in_colnames, in_excel_sheets, original_data_file_name_textbox, join_colnames, in_group_col])
354
 
355
  # Click on cost code dataframe/dropdown fills in cost code textbox
356
  # Allow user to select items from cost code dataframe for cost code
@@ -401,6 +404,7 @@ with app:
401
  produce_structures_summary_radio,
402
  aws_access_key_textbox,
403
  aws_secret_key_textbox,
 
404
  output_folder_state],
405
  outputs=[display_topic_table_markdown,
406
  master_topic_df_state,
@@ -498,6 +502,7 @@ with app:
498
  produce_structures_summary_radio,
499
  aws_access_key_textbox,
500
  aws_secret_key_textbox,
 
501
  output_folder_state],
502
  outputs=[display_topic_table_markdown,
503
  master_topic_df_state,
 
12
  from tools.auth import authenticate_user
13
  from tools.prompts import initial_table_prompt, prompt2, prompt3, system_prompt, add_existing_topics_system_prompt, add_existing_topics_prompt, verify_titles_prompt, verify_titles_system_prompt, two_para_summary_format_prompt, single_para_summary_format_prompt
14
  from tools.verify_titles import verify_titles
15
+ from tools.config import RUN_AWS_FUNCTIONS, HOST_NAME, ACCESS_LOGS_FOLDER, FEEDBACK_LOGS_FOLDER, USAGE_LOGS_FOLDER, RUN_LOCAL_MODEL, FILE_INPUT_HEIGHT, GEMINI_API_KEY, model_full_names, BATCH_SIZE_DEFAULT, CHOSEN_LOCAL_MODEL_TYPE, LLM_SEED, COGNITO_AUTH, MAX_QUEUE_SIZE, MAX_FILE_SIZE, GRADIO_SERVER_PORT, ROOT_PATH, INPUT_FOLDER, OUTPUT_FOLDER, S3_LOG_BUCKET, CONFIG_FOLDER, GRADIO_TEMP_DIR, MPLCONFIGDIR, model_name_map, GET_COST_CODES, ENFORCE_COST_CODES, DEFAULT_COST_CODE, COST_CODES_PATH, S3_COST_CODES_PATH, OUTPUT_COST_CODES_PATH, SHOW_COSTS, SAVE_LOGS_TO_CSV, SAVE_LOGS_TO_DYNAMODB, ACCESS_LOG_DYNAMODB_TABLE_NAME, USAGE_LOG_DYNAMODB_TABLE_NAME, FEEDBACK_LOG_DYNAMODB_TABLE_NAME, LOG_FILE_NAME, FEEDBACK_LOG_FILE_NAME, USAGE_LOG_FILE_NAME, CSV_ACCESS_LOG_HEADERS, CSV_FEEDBACK_LOG_HEADERS, CSV_USAGE_LOG_HEADERS, DYNAMODB_ACCESS_LOG_HEADERS, DYNAMODB_FEEDBACK_LOG_HEADERS, DYNAMODB_USAGE_LOG_HEADERS, S3_ACCESS_LOGS_FOLDER, S3_FEEDBACK_LOGS_FOLDER, S3_USAGE_LOGS_FOLDER, AWS_ACCESS_KEY, AWS_SECRET_KEY, SHOW_EXAMPLES, HF_TOKEN
16
 
17
  def ensure_folder_exists(output_folder:str):
18
  """Checks if the specified folder exists, creates it if not."""
 
148
 
149
  Instructions on use can be found in the README.md file. Try it out with this [dummy development consultation dataset](https://huggingface.co/datasets/seanpedrickcase/dummy_development_consultation/tree/main), which you can also try with [zero-shot topics](https://huggingface.co/datasets/seanpedrickcase/dummy_development_consultation/tree/main). Try also this [dummy case notes dataset](https://huggingface.co/datasets/seanpedrickcase/dummy_case_notes/tree/main).
150
 
151
+ You can use an AWS Bedrock model (paid), or Gemini (a free API for Flash). The use of Gemini requires an API key. To set up your own Gemini API key, go [here](https://aistudio.google.com/app/u/1/plan_information).
152
 
153
  NOTE: Large language models are not 100% accurate and may produce biased or harmful outputs. All outputs from this app **absolutely need to be checked by a human** for harmful outputs, hallucinations, and accuracy.""")
154
 
155
  with gr.Tab(label="1. Extract topics"):
156
  gr.Markdown("""### Choose a tabular data file (xlsx, csv, parquet) of open text to extract topics from.""")
157
  with gr.Row():
158
+ model_choice = gr.Dropdown(value = default_model_choice, choices = model_full_names, label="LLM model", multiselect=False)
159
 
160
  with gr.Accordion("Upload xlsx or csv file", open = True):
161
  in_data_files = gr.File(height=FILE_INPUT_HEIGHT, label="Choose Excel or csv files", file_count= "multiple", file_types=['.xlsx', '.xls', '.csv', '.parquet'])
 
177
 
178
  sentiment_checkbox = gr.Radio(label="Choose sentiment categories to split responses", value="Negative or Positive", choices=["Negative or Positive", "Negative, Neutral, or Positive", "Do not assess sentiment"])
179
 
180
+ if SHOW_EXAMPLES == "True":
181
+ examples = gr.Examples(examples=[[["example_data/dummy_consultation_response.csv"], "Response text", "Consultation for the construction of flats on Main Street"], [["example_data/combined_case_notes.csv"], "Case Note", "Social Care case notes for young people"]], inputs=[in_data_files, in_colnames, context_textbox], example_labels=["Consultation for the construction of flats on Main Street", "Social Care case notes for young people"], label="Test with an example dataset")
182
+
183
  if GET_COST_CODES == "True" or ENFORCE_COST_CODES == "True":
184
  with gr.Accordion("Assign task to cost code", open = True, visible=True):
185
  gr.Markdown("Please ensure that you have approval from your budget holder before using this app for redaction tasks that incur a cost.")
 
191
 
192
  all_in_one_btn = gr.Button("All in one - Extract topics, deduplicate, and summarise", variant="primary")
193
  extract_topics_btn = gr.Button("1. Extract topics", variant="secondary")
 
 
 
194
 
195
  with gr.Row(equal_height=True):
196
  output_messages_textbox = gr.Textbox(value="", label="Output messages", scale=1, interactive=False)
 
307
  with gr.Accordion("Gemini API keys", open = False):
308
  google_api_key_textbox = gr.Textbox(value = GEMINI_API_KEY, label="Enter Gemini API key (only if using Google API models)", lines=1, type="password")
309
 
310
+ with gr.Accordion("Hugging Face API keys", open = False):
311
+ hf_api_key_textbox = gr.Textbox(value = HF_TOKEN, label="Enter Hugging Face API key (only if using Hugging Face models)", lines=1, type="password")
312
+
313
  with gr.Accordion("Log outputs", open = False):
314
  log_files_output = gr.File(height=FILE_INPUT_HEIGHT, label="Log file output", interactive=False)
315
  conversation_metadata_textbox = gr.Textbox(value="", label="Query metadata - usage counts and other parameters", lines=8)
 
353
  ###
354
 
355
  # Tabular data upload
356
+ in_data_files.upload(fn=put_columns_in_df, inputs=[in_data_files], outputs=[in_colnames, in_excel_sheets, original_data_file_name_textbox, join_colnames, in_group_col])
357
 
358
  # Click on cost code dataframe/dropdown fills in cost code textbox
359
  # Allow user to select items from cost code dataframe for cost code
 
404
  produce_structures_summary_radio,
405
  aws_access_key_textbox,
406
  aws_secret_key_textbox,
407
+ hf_api_key_textbox,
408
  output_folder_state],
409
  outputs=[display_topic_table_markdown,
410
  master_topic_df_state,
 
502
  produce_structures_summary_radio,
503
  aws_access_key_textbox,
504
  aws_secret_key_textbox,
505
+ hf_api_key_textbox,
506
  output_folder_state],
507
  outputs=[display_topic_table_markdown,
508
  master_topic_df_state,
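The new Hugging Face key field mirrors the existing Gemini accordion: a password-type textbox inside an accordion, appended to the extract-topics event inputs so the value reaches load_model. A stand-alone sketch of that wiring pattern is shown below; it is illustrative only, not the app's actual layout.

```python
# Stand-alone Gradio sketch of the pattern added in app.py above; component labels
# match the diff, the demo logic is a placeholder.
import gradio as gr

def check_key(hf_key: str) -> str:
    # In the app, this value travels through the extract-topics inputs to load_model(hf_token=...)
    return "Hugging Face token provided" if hf_key else "No Hugging Face token provided"

with gr.Blocks() as demo:
    with gr.Accordion("Hugging Face API keys", open=False):
        hf_api_key_textbox = gr.Textbox(
            label="Enter Hugging Face API key (only if using Hugging Face models)",
            lines=1, type="password")
    output_messages_textbox = gr.Textbox(label="Output messages", interactive=False)
    gr.Button("Check").click(check_key, inputs=[hf_api_key_textbox], outputs=[output_messages_textbox])

if __name__ == "__main__":
    demo.launch()
```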
requirements.txt CHANGED
@@ -18,6 +18,8 @@ python-dotenv==1.1.0
18
  # GPU
19
  torch==2.6.0 --extra-index-url https://download.pytorch.org/whl/cu124 # Latest compatible with CUDA 12.4
20
  https://github.com/abetlen/llama-cpp-python/releases/download/v0.3.16-cu124/llama_cpp_python-0.3.16-cp310-cp310-linux_x86_64.whl
 
 
21
  # CPU only (for e.g. Hugging Face CPU instances)
22
  #torch==2.7.1 --extra-index-url https://download.pytorch.org/whl/cpu
23
  # For Hugging Face, need a python 3.10 compatible wheel for llama-cpp-python to avoid build timeouts
 
18
  # GPU
19
  torch==2.6.0 --extra-index-url https://download.pytorch.org/whl/cu124 # Latest compatible with CUDA 12.4
20
  https://github.com/abetlen/llama-cpp-python/releases/download/v0.3.16-cu124/llama_cpp_python-0.3.16-cp310-cp310-linux_x86_64.whl
21
+ bitsandbytes==0.47.0
22
+ accelerate==1.10.1
23
  # CPU only (for e.g. Hugging Face CPU instances)
24
  #torch==2.7.1 --extra-index-url https://download.pytorch.org/whl/cpu
25
  # For Hugging Face, need a python 3.10 compatible wheel for llama-cpp-python to avoid build timeouts
requirements_gpu.txt CHANGED
@@ -17,10 +17,12 @@ python-dotenv==1.1.0
17
  # Torch and Llama CPP Python
18
  torch==2.6.0 --extra-index-url https://download.pytorch.org/whl/cu124 # Latest compatible with CUDA 12.4
19
  # For Linux:
20
- https://github.com/abetlen/llama-cpp-python/releases/download/v0.3.16-cu124/llama_cpp_python-0.3.16-cp311-cp311-linux_x86_64.whl
21
  # For Windows:
22
- #https://github.com/seanpedrick-case/llama-cpp-python-whl-builder/releases/download/v0.1.0/llama_cpp_python-0.3.16-cp311-cp311-win_amd64.whl
23
  # If above doesn't work for Windows, try looking at 'windows_install_llama-cpp-python.txt' for instructions on how to build from source
24
  # If none of the above work for you, try the following:
25
  # llama-cpp-python==0.3.16 -C cmake.args="-DGGML_CUDA=on -DGGML_CUBLAS=on"
 
 
26
 
 
17
  # Torch and Llama CPP Python
18
  torch==2.6.0 --extra-index-url https://download.pytorch.org/whl/cu124 # Latest compatible with CUDA 12.4
19
  # For Linux:
20
+ #https://github.com/abetlen/llama-cpp-python/releases/download/v0.3.16-cu124/llama_cpp_python-0.3.16-cp311-cp311-linux_x86_64.whl
21
  # For Windows:
22
+ https://github.com/seanpedrick-case/llama-cpp-python-whl-builder/releases/download/v0.1.0/llama_cpp_python-0.3.16-cp311-cp311-win_amd64.whl
23
  # If above doesn't work for Windows, try looking at 'windows_install_llama-cpp-python.txt' for instructions on how to build from source
24
  # If none of the above work for you, try the following:
25
  # llama-cpp-python==0.3.16 -C cmake.args="-DGGML_CUDA=on -DGGML_CUBLAS=on"
26
+ bitsandbytes==0.47.0
27
+ accelerate==1.10.1
28
 
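The new bitsandbytes and accelerate pins only pay off when a CUDA device is visible to torch. A quick hedged sanity check along these lines can confirm the GPU stack is importable before enabling quantisation; the snippet is illustrative and not part of the repository.

```python
# Sanity check for the GPU dependencies pinned above; illustrative only.
import torch

print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("Device:", torch.cuda.get_device_name(0))

try:
    import accelerate
    import bitsandbytes
    print("bitsandbytes", bitsandbytes.__version__, "/ accelerate", accelerate.__version__)
except ImportError as exc:
    print("Quantisation extras not installed:", exc)
```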
tools/config.py CHANGED
@@ -241,20 +241,38 @@ model_name_map = {
241
 
242
  # HF token may or may not be needed for downloading models from Hugging Face
243
  HF_TOKEN = get_or_create_env_var('HF_TOKEN', '')
 
 
244
 
245
  GEMMA2_REPO_ID = get_or_create_env_var("GEMMA2_2B_REPO_ID", "lmstudio-community/gemma-2-2b-it-GGUF")
 
 
 
 
246
  GEMMA2_MODEL_FILE = get_or_create_env_var("GEMMA2_2B_MODEL_FILE", "gemma-2-2b-it-Q8_0.gguf")
247
  GEMMA2_MODEL_FOLDER = get_or_create_env_var("GEMMA2_2B_MODEL_FOLDER", "model/gemma")
248
 
249
  GEMMA3_REPO_ID = get_or_create_env_var("GEMMA3_REPO_ID", "unsloth/gemma-3-270m-it-qat-GGUF")
 
 
 
 
250
  GEMMA3_MODEL_FILE = get_or_create_env_var("GEMMA3_MODEL_FILE", "gemma-3-270m-it-qat-F16.gguf")
251
  GEMMA3_MODEL_FOLDER = get_or_create_env_var("GEMMA3_MODEL_FOLDER", "model/gemma")
252
 
253
  GEMMA3_4B_REPO_ID = get_or_create_env_var("GEMMA3_4B_REPO_ID", "unsloth/gemma-3-4b-it-qat-GGUF")
 
 
 
 
254
  GEMMA3_4B_MODEL_FILE = get_or_create_env_var("GEMMA3_4B_MODEL_FILE", "gemma-3-4b-it-qat-Q4_K_M.gguf")
255
  GEMMA3_4B_MODEL_FOLDER = get_or_create_env_var("GEMMA3_4B_MODEL_FOLDER", "model/gemma3_4b")
256
 
257
  GPT_OSS_REPO_ID = get_or_create_env_var("GPT_OSS_REPO_ID", "unsloth/gpt-oss-20b-GGUF")
 
 
 
 
258
  GPT_OSS_MODEL_FILE = get_or_create_env_var("GPT_OSS_MODEL_FILE", "gpt-oss-20b-F16.gguf")
259
  GPT_OSS_MODEL_FOLDER = get_or_create_env_var("GPT_OSS_MODEL_FOLDER", "model/gpt_oss")
260
 
@@ -305,6 +323,13 @@ SPECULATIVE_DECODING = get_or_create_env_var('SPECULATIVE_DECODING', 'False')
305
  NUM_PRED_TOKENS = int(get_or_create_env_var('NUM_PRED_TOKENS', '2'))
306
  REASONING_SUFFIX = get_or_create_env_var('REASONING_SUFFIX', 'Reasoning: low')
307
 
308
  MAX_GROUPS = int(get_or_create_env_var('MAX_GROUPS', '99'))
309
 
310
  ###
 
241
 
242
  # HF token may or may not be needed for downloading models from Hugging Face
243
  HF_TOKEN = get_or_create_env_var('HF_TOKEN', '')
244
+ USE_LLAMA_CPP = get_or_create_env_var('USE_LLAMA_CPP', 'True') # Llama.cpp or transformers
245
+
246
 
247
  GEMMA2_REPO_ID = get_or_create_env_var("GEMMA2_2B_REPO_ID", "lmstudio-community/gemma-2-2b-it-GGUF")
248
+ GEMMA2_REPO_TRANSFORMERS_ID = get_or_create_env_var("GEMMA2_2B_REPO_TRANSFORMERS_ID", "google/gemma-2-2b-it")
249
+ if USE_LLAMA_CPP == "False":
250
+ GEMMA2_REPO_ID = GEMMA2_REPO_TRANSFORMERS_ID
251
+
252
  GEMMA2_MODEL_FILE = get_or_create_env_var("GEMMA2_2B_MODEL_FILE", "gemma-2-2b-it-Q8_0.gguf")
253
  GEMMA2_MODEL_FOLDER = get_or_create_env_var("GEMMA2_2B_MODEL_FOLDER", "model/gemma")
254
 
255
  GEMMA3_REPO_ID = get_or_create_env_var("GEMMA3_REPO_ID", "unsloth/gemma-3-270m-it-qat-GGUF")
256
+ GEMMA3_REPO_TRANSFORMERS_ID = get_or_create_env_var("GEMMA3_REPO_TRANSFORMERS_ID", "google/gemma-3-270m-it")
257
+ if USE_LLAMA_CPP == "False":
258
+ GEMMA3_REPO_ID = GEMMA3_REPO_TRANSFORMERS_ID
259
+
260
  GEMMA3_MODEL_FILE = get_or_create_env_var("GEMMA3_MODEL_FILE", "gemma-3-270m-it-qat-F16.gguf")
261
  GEMMA3_MODEL_FOLDER = get_or_create_env_var("GEMMA3_MODEL_FOLDER", "model/gemma")
262
 
263
  GEMMA3_4B_REPO_ID = get_or_create_env_var("GEMMA3_4B_REPO_ID", "unsloth/gemma-3-4b-it-qat-GGUF")
264
+ GEMMA3_4B_REPO_TRANSFORMERS_ID = get_or_create_env_var("GEMMA3_4B_REPO_TRANSFORMERS_ID", "google/gemma-3-4b-it")
265
+ if USE_LLAMA_CPP == "False":
266
+ GEMMA3_4B_REPO_ID = GEMMA3_4B_REPO_TRANSFORMERS_ID
267
+
268
  GEMMA3_4B_MODEL_FILE = get_or_create_env_var("GEMMA3_4B_MODEL_FILE", "gemma-3-4b-it-qat-Q4_K_M.gguf")
269
  GEMMA3_4B_MODEL_FOLDER = get_or_create_env_var("GEMMA3_4B_MODEL_FOLDER", "model/gemma3_4b")
270
 
271
  GPT_OSS_REPO_ID = get_or_create_env_var("GPT_OSS_REPO_ID", "unsloth/gpt-oss-20b-GGUF")
272
+ GPT_OSS_REPO_TRANSFORMERS_ID = get_or_create_env_var("GPT_OSS_REPO_TRANSFORMERS_ID", "openai/gpt-oss-20b")
273
+ if USE_LLAMA_CPP == "False":
274
+ GPT_OSS_REPO_ID = GPT_OSS_REPO_TRANSFORMERS_ID
275
+
276
  GPT_OSS_MODEL_FILE = get_or_create_env_var("GPT_OSS_MODEL_FILE", "gpt-oss-20b-F16.gguf")
277
  GPT_OSS_MODEL_FOLDER = get_or_create_env_var("GPT_OSS_MODEL_FOLDER", "model/gpt_oss")
278
 
 
323
  NUM_PRED_TOKENS = int(get_or_create_env_var('NUM_PRED_TOKENS', '2'))
324
  REASONING_SUFFIX = get_or_create_env_var('REASONING_SUFFIX', 'Reasoning: low')
325
 
326
+ # Transformers variables
327
+ COMPILE_TRANSFORMERS = get_or_create_env_var('COMPILE_TRANSFORMERS', 'True') # Whether to compile transformers models
328
+ USE_BITSANDBYTES = get_or_create_env_var('USE_BITSANDBYTES', 'True') # Whether to use bitsandbytes for quantization
329
+ COMPILE_MODE = get_or_create_env_var('COMPILE_MODE', 'reduce-overhead') # alternatively 'max-autotune'
330
+ MODEL_DTYPE = get_or_create_env_var('MODEL_DTYPE', 'float16') # alternatively 'bfloat16'
331
+ OFFLOAD_TO_CPU = get_or_create_env_var('OFFLOAD_TO_CPU', 'False') # Whether to offload to CPU
332
+
333
  MAX_GROUPS = int(get_or_create_env_var('MAX_GROUPS', '99'))
334
 
335
  ###
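Each local model now carries a paired *_REPO_TRANSFORMERS_ID, and the GGUF repo ID is swapped for it whenever USE_LLAMA_CPP is "False". A small sketch of overriding one of these from the environment; the env var names come from the diff above and both must be set before tools.config is imported. The override value here is illustrative (it is the same as the default).

```python
# Illustrative override, not code from the commit.
import os

os.environ["USE_LLAMA_CPP"] = "False"
os.environ["GEMMA3_4B_REPO_TRANSFORMERS_ID"] = "google/gemma-3-4b-it"  # same as the default above

from tools.config import GEMMA3_4B_REPO_ID
print(GEMMA3_4B_REPO_ID)  # with llama.cpp disabled this is the transformers repo ID, not the GGUF repo
```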
tools/dedup_summaries.py CHANGED
@@ -415,7 +415,7 @@ def sample_reference_table_summaries(reference_df:pd.DataFrame,
415
 
416
  return sampled_reference_table_df, summarised_references_markdown#, reference_df, topic_summary_df
417
 
418
- def summarise_output_topics_query(model_choice:str, in_api_key:str, temperature:float, formatted_summary_prompt:str, summarise_topic_descriptions_system_prompt:str, model_source:str, bedrock_runtime:boto3.Session.client, local_model=list()):
419
  """
420
  Query an LLM to generate a summary of topics based on the provided prompts.
421
 
@@ -428,7 +428,7 @@ def summarise_output_topics_query(model_choice:str, in_api_key:str, temperature:
428
  model_source (str): Source of the model (e.g. "AWS", "Gemini", "Local")
429
  bedrock_runtime (boto3.Session.client): AWS Bedrock runtime client for AWS models
430
  local_model (object, optional): Local model object if using local inference. Defaults to empty list.
431
-
432
  Returns:
433
  tuple: Contains:
434
  - response_text (str): The generated summary text
@@ -454,7 +454,7 @@ def summarise_output_topics_query(model_choice:str, in_api_key:str, temperature:
454
  whole_conversation = [summarise_topic_descriptions_system_prompt]
455
 
456
  # Process requests to large language model
457
- responses, conversation_history, whole_conversation, whole_conversation_metadata, response_text = process_requests(formatted_summary_prompt, system_prompt, conversation_history, whole_conversation, whole_conversation_metadata, google_client, google_config, model_choice, temperature, bedrock_runtime=bedrock_runtime, model_source=model_source, local_model=local_model, assistant_prefill=summary_assistant_prefill)
458
 
459
  print("Finished summary query")
460
 
@@ -482,7 +482,9 @@ def summarise_output_topics(sampled_reference_table_df:pd.DataFrame,
482
  aws_secret_key_textbox:str='',
483
  model_name_map:dict=model_name_map,
484
  reasoning_suffix:str=reasoning_suffix,
485
- local_model:object=list(),
 
 
486
  summarise_topic_descriptions_prompt:str=summarise_topic_descriptions_prompt,
487
  summarise_topic_descriptions_system_prompt:str=summarise_topic_descriptions_system_prompt,
488
  do_summaries:str="Yes",
@@ -572,7 +574,7 @@ def summarise_output_topics(sampled_reference_table_df:pd.DataFrame,
572
 
573
  if (model_source == "Local") & (RUN_LOCAL_MODEL == "1"):
574
  progress(0.1, f"Loading in local model: {CHOSEN_LOCAL_MODEL_TYPE}")
575
- local_model, tokenizer = load_model(local_model_type=CHOSEN_LOCAL_MODEL_TYPE, repo_id=LOCAL_REPO_ID, model_filename=LOCAL_MODEL_FILE, model_dir=LOCAL_MODEL_FOLDER)
576
 
577
  summary_loop_description = "Revising topic-level summaries. " + str(latest_summary_completed) + " summaries completed so far."
578
  summary_loop = tqdm(range(latest_summary_completed, length_all_summaries), desc="Revising topic-level summaries", unit="summaries")
@@ -592,7 +594,7 @@ def summarise_output_topics(sampled_reference_table_df:pd.DataFrame,
592
  if "Local" in model_source and reasoning_suffix: formatted_summarise_topic_descriptions_system_prompt = formatted_summarise_topic_descriptions_system_prompt + "\n" + reasoning_suffix
593
 
594
  try:
595
- response, conversation_history, metadata = summarise_output_topics_query(model_choice, in_api_key, temperature, formatted_summary_prompt, formatted_summarise_topic_descriptions_system_prompt, model_source, bedrock_runtime, local_model)
596
  summarised_output = response
597
  summarised_output = re.sub(r'\n{2,}', '\n', summarised_output) # Replace multiple line breaks with a single line break
598
  summarised_output = re.sub(r'^\n{1,}', '', summarised_output) # Remove one or more line breaks at the start
@@ -697,7 +699,9 @@ def overall_summary(topic_summary_df:pd.DataFrame,
697
  aws_secret_key_textbox:str='',
698
  model_name_map:dict=model_name_map,
699
  reasoning_suffix:str=reasoning_suffix,
700
- local_model:object=list(),
 
 
701
  summarise_everything_prompt:str=summarise_everything_prompt,
702
  comprehensive_summary_format_prompt:str=comprehensive_summary_format_prompt,
703
  comprehensive_summary_format_prompt_by_group:str=comprehensive_summary_format_prompt_by_group,
@@ -721,6 +725,8 @@ def overall_summary(topic_summary_df:pd.DataFrame,
721
  model_name_map (dict, optional): Mapping of model names. Defaults to model_name_map.
722
  reasoning_suffix (str, optional): Suffix for reasoning. Defaults to reasoning_suffix.
723
  local_model (object, optional): Local model object. Defaults to empty list.
 
 
724
  summarise_everything_prompt (str, optional): Prompt for overall summary
725
  comprehensive_summary_format_prompt (str, optional): Prompt for comprehensive summary format
726
  comprehensive_summary_format_prompt_by_group (str, optional): Prompt for group summary format
@@ -795,7 +801,7 @@ def overall_summary(topic_summary_df:pd.DataFrame,
795
 
796
  if (model_choice == CHOSEN_LOCAL_MODEL_TYPE) & (RUN_LOCAL_MODEL == "1"):
797
  progress(0.1, f"Loading in local model: {CHOSEN_LOCAL_MODEL_TYPE}")
798
- local_model, tokenizer = load_model(local_model_type=CHOSEN_LOCAL_MODEL_TYPE, repo_id=LOCAL_REPO_ID, model_filename=LOCAL_MODEL_FILE, model_dir=LOCAL_MODEL_FOLDER)
799
  #print("Local model loaded:", local_model)
800
 
801
  summary_loop = tqdm(unique_groups, desc="Creating overall summary for groups", unit="groups")
@@ -806,7 +812,7 @@ def overall_summary(topic_summary_df:pd.DataFrame,
806
 
807
  for summary_group in summary_loop:
808
 
809
- print("Creating overallsummary for group:", summary_group)
810
 
811
  summary_text = topic_summary_df.loc[topic_summary_df["Group"]==summary_group].to_markdown(index=False)
812
 
@@ -817,7 +823,7 @@ def overall_summary(topic_summary_df:pd.DataFrame,
817
  if "Local" in model_source and reasoning_suffix: formatted_summarise_everything_system_prompt = formatted_summarise_everything_system_prompt + "\n" + reasoning_suffix
818
 
819
  try:
820
- response, conversation_history, metadata = summarise_output_topics_query(model_choice, in_api_key, temperature, formatted_summary_prompt, formatted_summarise_everything_system_prompt, model_source, bedrock_runtime, local_model)
821
  summarised_output_for_df = response
822
  summarised_output = response
823
  summarised_output = re.sub(r'\n{2,}', '\n', summarised_output) # Replace multiple line breaks with a single line break
 
415
 
416
  return sampled_reference_table_df, summarised_references_markdown#, reference_df, topic_summary_df
417
 
418
+ def summarise_output_topics_query(model_choice:str, in_api_key:str, temperature:float, formatted_summary_prompt:str, summarise_topic_descriptions_system_prompt:str, model_source:str, bedrock_runtime:boto3.Session.client, local_model=list(), tokenizer=list()):
419
  """
420
  Query an LLM to generate a summary of topics based on the provided prompts.
421
 
 
428
  model_source (str): Source of the model (e.g. "AWS", "Gemini", "Local")
429
  bedrock_runtime (boto3.Session.client): AWS Bedrock runtime client for AWS models
430
  local_model (object, optional): Local model object if using local inference. Defaults to empty list.
431
+ tokenizer (object, optional): Tokenizer object if using local inference. Defaults to empty list.
432
  Returns:
433
  tuple: Contains:
434
  - response_text (str): The generated summary text
 
454
  whole_conversation = [summarise_topic_descriptions_system_prompt]
455
 
456
  # Process requests to large language model
457
+ responses, conversation_history, whole_conversation, whole_conversation_metadata, response_text = process_requests(formatted_summary_prompt, system_prompt, conversation_history, whole_conversation, whole_conversation_metadata, google_client, google_config, model_choice, temperature, bedrock_runtime=bedrock_runtime, model_source=model_source, local_model=local_model, tokenizer=tokenizer, assistant_prefill=summary_assistant_prefill)
458
 
459
  print("Finished summary query")
460
 
 
482
  aws_secret_key_textbox:str='',
483
  model_name_map:dict=model_name_map,
484
  reasoning_suffix:str=reasoning_suffix,
485
+ local_model:object=list(),
486
+ tokenizer:object=list(),
487
+ hf_api_key_textbox:str='',
488
  summarise_topic_descriptions_prompt:str=summarise_topic_descriptions_prompt,
489
  summarise_topic_descriptions_system_prompt:str=summarise_topic_descriptions_system_prompt,
490
  do_summaries:str="Yes",
 
574
 
575
  if (model_source == "Local") & (RUN_LOCAL_MODEL == "1"):
576
  progress(0.1, f"Loading in local model: {CHOSEN_LOCAL_MODEL_TYPE}")
577
+ local_model, tokenizer = load_model(local_model_type=CHOSEN_LOCAL_MODEL_TYPE, repo_id=LOCAL_REPO_ID, model_filename=LOCAL_MODEL_FILE, model_dir=LOCAL_MODEL_FOLDER, hf_token=hf_api_key_textbox)
578
 
579
  summary_loop_description = "Revising topic-level summaries. " + str(latest_summary_completed) + " summaries completed so far."
580
  summary_loop = tqdm(range(latest_summary_completed, length_all_summaries), desc="Revising topic-level summaries", unit="summaries")
 
594
  if "Local" in model_source and reasoning_suffix: formatted_summarise_topic_descriptions_system_prompt = formatted_summarise_topic_descriptions_system_prompt + "\n" + reasoning_suffix
595
 
596
  try:
597
+ response, conversation_history, metadata = summarise_output_topics_query(model_choice, in_api_key, temperature, formatted_summary_prompt, formatted_summarise_topic_descriptions_system_prompt, model_source, bedrock_runtime, local_model, tokenizer=tokenizer)
598
  summarised_output = response
599
  summarised_output = re.sub(r'\n{2,}', '\n', summarised_output) # Replace multiple line breaks with a single line break
600
  summarised_output = re.sub(r'^\n{1,}', '', summarised_output) # Remove one or more line breaks at the start
 
699
  aws_secret_key_textbox:str='',
700
  model_name_map:dict=model_name_map,
701
  reasoning_suffix:str=reasoning_suffix,
702
+ local_model:object=list(),
703
+ tokenizer:object=list(),
704
+ hf_api_key_textbox:str='',
705
  summarise_everything_prompt:str=summarise_everything_prompt,
706
  comprehensive_summary_format_prompt:str=comprehensive_summary_format_prompt,
707
  comprehensive_summary_format_prompt_by_group:str=comprehensive_summary_format_prompt_by_group,
 
725
  model_name_map (dict, optional): Mapping of model names. Defaults to model_name_map.
726
  reasoning_suffix (str, optional): Suffix for reasoning. Defaults to reasoning_suffix.
727
  local_model (object, optional): Local model object. Defaults to empty list.
728
+ tokenizer (object, optional): Tokenizer object. Defaults to empty list.
729
+ hf_api_key_textbox (str, optional): Hugging Face API key. Defaults to empty string.
730
  summarise_everything_prompt (str, optional): Prompt for overall summary
731
  comprehensive_summary_format_prompt (str, optional): Prompt for comprehensive summary format
732
  comprehensive_summary_format_prompt_by_group (str, optional): Prompt for group summary format
 
801
 
802
  if (model_choice == CHOSEN_LOCAL_MODEL_TYPE) & (RUN_LOCAL_MODEL == "1"):
803
  progress(0.1, f"Loading in local model: {CHOSEN_LOCAL_MODEL_TYPE}")
804
+ local_model, tokenizer = load_model(local_model_type=CHOSEN_LOCAL_MODEL_TYPE, repo_id=LOCAL_REPO_ID, model_filename=LOCAL_MODEL_FILE, model_dir=LOCAL_MODEL_FOLDER, hf_token=hf_api_key_textbox)
805
  #print("Local model loaded:", local_model)
806
 
807
  summary_loop = tqdm(unique_groups, desc="Creating overall summary for groups", unit="groups")
 
812
 
813
  for summary_group in summary_loop:
814
 
815
+ print("Creating overall summary for group:", summary_group)
816
 
817
  summary_text = topic_summary_df.loc[topic_summary_df["Group"]==summary_group].to_markdown(index=False)
818
 
 
823
  if "Local" in model_source and reasoning_suffix: formatted_summarise_everything_system_prompt = formatted_summarise_everything_system_prompt + "\n" + reasoning_suffix
824
 
825
  try:
826
+ response, conversation_history, metadata = summarise_output_topics_query(model_choice, in_api_key, temperature, formatted_summary_prompt, formatted_summarise_everything_system_prompt, model_source, bedrock_runtime, local_model, tokenizer=tokenizer)
827
  summarised_output_for_df = response
828
  summarised_output = response
829
  summarised_output = re.sub(r'\n{2,}', '\n', summarised_output) # Replace multiple line breaks with a single line break
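summarise_output_topics_query and its callers now thread a tokenizer (and, higher up, the Hugging Face key) through to process_requests and load_model. Below is a hedged sketch of the updated call shape for a local model; the keyword names follow the signature added above, while the prompt text and settings are placeholders.

```python
# Placeholder call, illustrative only; not code from the commit.
from tools.config import CHOSEN_LOCAL_MODEL_TYPE, LOCAL_REPO_ID, LOCAL_MODEL_FILE, LOCAL_MODEL_FOLDER
from tools.llm_funcs import load_model
from tools.dedup_summaries import summarise_output_topics_query

local_model, tokenizer = load_model(local_model_type=CHOSEN_LOCAL_MODEL_TYPE,
                                    repo_id=LOCAL_REPO_ID,
                                    model_filename=LOCAL_MODEL_FILE,
                                    model_dir=LOCAL_MODEL_FOLDER)

response, conversation_history, metadata = summarise_output_topics_query(
    model_choice=CHOSEN_LOCAL_MODEL_TYPE,
    in_api_key="",                    # Gemini key, unused for local inference
    temperature=0.1,
    formatted_summary_prompt="Summarise the following topic descriptions:\n...",
    summarise_topic_descriptions_system_prompt="You are a concise summariser of consultation topics.",
    model_source="Local",
    bedrock_runtime=None,             # Bedrock client, only needed for AWS models
    local_model=local_model,
    tokenizer=tokenizer,
)
```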
tools/llm_api_call.py CHANGED
@@ -688,6 +688,7 @@ def extract_topics(in_data_file: GradioFileData,
688
  produce_structures_summary_radio:str="No",
689
  aws_access_key_textbox:str='',
690
  aws_secret_key_textbox:str='',
 
691
  max_tokens:int=max_tokens,
692
  model_name_map:dict=model_name_map,
693
  max_time_for_loop:int=max_time_for_loop,
@@ -737,6 +738,7 @@ def extract_topics(in_data_file: GradioFileData,
737
  - force_single_topic_prompt (str, optional): The prompt for forcing the model to assign only one single topic to each response.
738
  - aws_access_key_textbox (str, optional): AWS access key for account with Bedrock permissions.
739
  - aws_secret_key_textbox (str, optional): AWS secret key for account with Bedrock permissions.
 
740
  - max_tokens (int): The maximum number of tokens for the model.
741
  - model_name_map (dict, optional): A dictionary mapping full model name to shortened.
742
  - max_time_for_loop (int, optional): The number of seconds maximum that the function should run for before breaking (to run again, this is to avoid timeouts with some AWS services if deployed there).
@@ -808,7 +810,7 @@ def extract_topics(in_data_file: GradioFileData,
808
 
809
  if (model_source == "Local") & (RUN_LOCAL_MODEL == "1"):
810
  progress(0.1, f"Loading in local model: {CHOSEN_LOCAL_MODEL_TYPE}")
811
- local_model, tokenizer = load_model(local_model_type=CHOSEN_LOCAL_MODEL_TYPE, repo_id=LOCAL_REPO_ID, model_filename=LOCAL_MODEL_FILE, model_dir=LOCAL_MODEL_FOLDER)
812
 
813
  if num_batches > 0:
814
  progress_measure = round(latest_batch_completed / num_batches, 1)
@@ -938,9 +940,9 @@ def extract_topics(in_data_file: GradioFileData,
938
  formatted_summary_prompt = structured_summary_prompt.format(response_table=normalised_simple_markdown_table,
939
  topics=unique_topics_markdown)
940
 
941
- if "gemma" in model_choice:
942
- formatted_summary_prompt = llama_cpp_prefix + formatted_system_prompt + "\n" + formatted_summary_prompt + llama_cpp_suffix
943
- full_prompt = formatted_summary_prompt
944
  else:
945
  full_prompt = formatted_system_prompt + "\n" + formatted_summary_prompt
946
 
@@ -970,7 +972,7 @@ def extract_topics(in_data_file: GradioFileData,
970
  whole_conversation = list()
971
 
972
  # Process requests to large language model
973
- responses, conversation_history, whole_conversation, whole_conversation_metadata, response_text = call_llm_with_markdown_table_checks(summary_prompt_list, formatted_system_prompt, conversation_history, whole_conversation, whole_conversation_metadata, google_client, google_config, model_choice, temperature, reported_batch_no, local_model, bedrock_runtime, model_source, MAX_OUTPUT_VALIDATION_ATTEMPTS, assistant_prefill=add_existing_topics_assistant_prefill, master = True)
974
 
975
  # Return output tables
976
  topic_table_out_path, reference_table_out_path, topic_summary_df_out_path, new_topic_df, new_reference_df, new_topic_summary_df, master_batch_out_file_part, is_error = write_llm_output_and_logs(response_text, whole_conversation, whole_conversation_metadata, file_name, latest_batch_completed, start_row, end_row, model_choice_clean, temperature, log_files_output_paths, existing_reference_df, existing_topic_summary_df, batch_size, chosen_cols, batch_basic_response_df, model_name_map, group_name, produce_structures_summary_radio, first_run=False, output_folder=output_folder)
@@ -1030,7 +1032,7 @@ def extract_topics(in_data_file: GradioFileData,
1030
  formatted_initial_table_system_prompt = initial_table_system_prompt.format(consultation_context=context_textbox, column_name=chosen_cols)
1031
 
1032
  # Prepare Gemini models before query
1033
- if "gemini" in model_choice:
1034
  print("Using Gemini model:", model_choice)
1035
  google_client, google_config = construct_gemini_generative_model(in_api_key=in_api_key, temperature=temperature, model_choice=model_choice, system_prompt=formatted_initial_table_system_prompt, max_tokens=max_tokens)
1036
  elif model_choice == CHOSEN_LOCAL_MODEL_TYPE:
@@ -1062,7 +1064,7 @@ def extract_topics(in_data_file: GradioFileData,
1062
 
1063
  whole_conversation = list()
1064
 
1065
- responses, conversation_history, whole_conversation, whole_conversation_metadata, response_text = call_llm_with_markdown_table_checks(batch_prompts, formatted_initial_table_system_prompt, conversation_history, whole_conversation, whole_conversation_metadata, google_client, google_config, model_choice, temperature, reported_batch_no, local_model, bedrock_runtime, model_source, MAX_OUTPUT_VALIDATION_ATTEMPTS, assistant_prefill=initial_table_assistant_prefill)
1066
 
1067
  topic_table_out_path, reference_table_out_path, topic_summary_df_out_path, topic_table_df, reference_df, new_topic_summary_df, batch_file_path_details, is_error = write_llm_output_and_logs(response_text, whole_conversation, whole_conversation_metadata, file_name, latest_batch_completed, start_row, end_row, model_choice_clean, temperature, log_files_output_paths, existing_reference_df, existing_topic_summary_df, batch_size, chosen_cols, batch_basic_response_df, model_name_map, group_name, produce_structures_summary_radio, first_run=True, output_folder=output_folder)
1068
 
@@ -1271,6 +1273,7 @@ def wrapper_extract_topics_per_column_value(
1271
  produce_structures_summary_radio: str = "No",
1272
  aws_access_key_textbox:str="",
1273
  aws_secret_key_textbox:str="",
 
1274
  output_folder: str = OUTPUT_FOLDER,
1275
  force_single_topic_prompt: str = force_single_topic_prompt,
1276
  max_tokens: int = max_tokens,
@@ -1419,6 +1422,7 @@ def wrapper_extract_topics_per_column_value(
1419
  produce_structures_summary_radio=produce_structures_summary_radio,
1420
  aws_access_key_textbox=aws_access_key_textbox,
1421
  aws_secret_key_textbox=aws_secret_key_textbox,
 
1422
  max_tokens=max_tokens,
1423
  model_name_map=model_name_map,
1424
  max_time_for_loop=max_time_for_loop,
 
688
  produce_structures_summary_radio:str="No",
689
  aws_access_key_textbox:str='',
690
  aws_secret_key_textbox:str='',
691
+ hf_api_key_textbox:str='',
692
  max_tokens:int=max_tokens,
693
  model_name_map:dict=model_name_map,
694
  max_time_for_loop:int=max_time_for_loop,
 
738
  - force_single_topic_prompt (str, optional): The prompt for forcing the model to assign only one single topic to each response.
739
  - aws_access_key_textbox (str, optional): AWS access key for account with Bedrock permissions.
740
  - aws_secret_key_textbox (str, optional): AWS secret key for account with Bedrock permissions.
741
+ - hf_api_key_textbox (str, optional): Hugging Face API key for account with Hugging Face permissions.
742
  - max_tokens (int): The maximum number of tokens for the model.
743
  - model_name_map (dict, optional): A dictionary mapping full model name to shortened.
744
  - max_time_for_loop (int, optional): The number of seconds maximum that the function should run for before breaking (to run again, this is to avoid timeouts with some AWS services if deployed there).
 
810
 
811
  if (model_source == "Local") & (RUN_LOCAL_MODEL == "1"):
812
  progress(0.1, f"Loading in local model: {CHOSEN_LOCAL_MODEL_TYPE}")
813
+ local_model, tokenizer = load_model(local_model_type=CHOSEN_LOCAL_MODEL_TYPE, repo_id=LOCAL_REPO_ID, model_filename=LOCAL_MODEL_FILE, model_dir=LOCAL_MODEL_FOLDER, hf_token=hf_api_key_textbox)
814
 
815
  if num_batches > 0:
816
  progress_measure = round(latest_batch_completed / num_batches, 1)
 
940
  formatted_summary_prompt = structured_summary_prompt.format(response_table=normalised_simple_markdown_table,
941
  topics=unique_topics_markdown)
942
 
943
+ if model_source == "Local":
944
+ #formatted_summary_prompt = llama_cpp_prefix + formatted_system_prompt + "\n" + formatted_summary_prompt + llama_cpp_suffix
945
+ full_prompt = formatted_system_prompt + "\n" + formatted_summary_prompt
946
  else:
947
  full_prompt = formatted_system_prompt + "\n" + formatted_summary_prompt
948
 
 
972
  whole_conversation = list()
973
 
974
  # Process requests to large language model
975
+ responses, conversation_history, whole_conversation, whole_conversation_metadata, response_text = call_llm_with_markdown_table_checks(summary_prompt_list, formatted_system_prompt, conversation_history, whole_conversation, whole_conversation_metadata, google_client, google_config, model_choice, temperature, reported_batch_no, local_model, tokenizer, bedrock_runtime, model_source, MAX_OUTPUT_VALIDATION_ATTEMPTS, assistant_prefill=add_existing_topics_assistant_prefill, master = True)
976
 
977
  # Return output tables
978
  topic_table_out_path, reference_table_out_path, topic_summary_df_out_path, new_topic_df, new_reference_df, new_topic_summary_df, master_batch_out_file_part, is_error = write_llm_output_and_logs(response_text, whole_conversation, whole_conversation_metadata, file_name, latest_batch_completed, start_row, end_row, model_choice_clean, temperature, log_files_output_paths, existing_reference_df, existing_topic_summary_df, batch_size, chosen_cols, batch_basic_response_df, model_name_map, group_name, produce_structures_summary_radio, first_run=False, output_folder=output_folder)
 
1032
  formatted_initial_table_system_prompt = initial_table_system_prompt.format(consultation_context=context_textbox, column_name=chosen_cols)
1033
 
1034
  # Prepare Gemini models before query
1035
+ if model_source == "Gemini":
1036
  print("Using Gemini model:", model_choice)
1037
  google_client, google_config = construct_gemini_generative_model(in_api_key=in_api_key, temperature=temperature, model_choice=model_choice, system_prompt=formatted_initial_table_system_prompt, max_tokens=max_tokens)
1038
  elif model_choice == CHOSEN_LOCAL_MODEL_TYPE:
 
1064
 
1065
  whole_conversation = list()
1066
 
1067
+ responses, conversation_history, whole_conversation, whole_conversation_metadata, response_text = call_llm_with_markdown_table_checks(batch_prompts, formatted_initial_table_system_prompt, conversation_history, whole_conversation, whole_conversation_metadata, google_client, google_config, model_choice, temperature, reported_batch_no, local_model, tokenizer, bedrock_runtime, model_source, MAX_OUTPUT_VALIDATION_ATTEMPTS, assistant_prefill=initial_table_assistant_prefill)
1068
 
1069
  topic_table_out_path, reference_table_out_path, topic_summary_df_out_path, topic_table_df, reference_df, new_topic_summary_df, batch_file_path_details, is_error = write_llm_output_and_logs(response_text, whole_conversation, whole_conversation_metadata, file_name, latest_batch_completed, start_row, end_row, model_choice_clean, temperature, log_files_output_paths, existing_reference_df, existing_topic_summary_df, batch_size, chosen_cols, batch_basic_response_df, model_name_map, group_name, produce_structures_summary_radio, first_run=True, output_folder=output_folder)
1070
 
 
1273
  produce_structures_summary_radio: str = "No",
1274
  aws_access_key_textbox:str="",
1275
  aws_secret_key_textbox:str="",
1276
+ hf_api_key_textbox:str="",
1277
  output_folder: str = OUTPUT_FOLDER,
1278
  force_single_topic_prompt: str = force_single_topic_prompt,
1279
  max_tokens: int = max_tokens,
 
1422
  produce_structures_summary_radio=produce_structures_summary_radio,
1423
  aws_access_key_textbox=aws_access_key_textbox,
1424
  aws_secret_key_textbox=aws_secret_key_textbox,
1425
+ hf_api_key_textbox=hf_api_key_textbox,
1426
  max_tokens=max_tokens,
1427
  model_name_map=model_name_map,
1428
  max_time_for_loop=max_time_for_loop,
tools/llm_funcs.py CHANGED
@@ -18,7 +18,7 @@ full_text = "" # Define dummy source text (full text) just to enable highlight f
18
  model = list() # Define empty list for model functions to run
19
  tokenizer = list() #[] # Define empty list for model functions to run
20
 
21
- from tools.config import AWS_REGION, LLM_TEMPERATURE, LLM_TOP_K, LLM_MIN_P, LLM_TOP_P, LLM_REPETITION_PENALTY, LLM_LAST_N_TOKENS, LLM_MAX_NEW_TOKENS, LLM_SEED, LLM_RESET, LLM_STREAM, LLM_THREADS, LLM_BATCH_SIZE, LLM_CONTEXT_LENGTH, LLM_SAMPLE, MAX_TOKENS, TIMEOUT_WAIT, NUMBER_OF_RETRY_ATTEMPTS, MAX_TIME_FOR_LOOP, BATCH_SIZE_DEFAULT, DEDUPLICATION_THRESHOLD, MAX_COMMENT_CHARS, CHOSEN_LOCAL_MODEL_TYPE, LOCAL_REPO_ID, LOCAL_MODEL_FILE, LOCAL_MODEL_FOLDER, HF_TOKEN, LLM_SEED, LLM_MAX_GPU_LAYERS, SPECULATIVE_DECODING, NUM_PRED_TOKENS
22
  from tools.prompts import initial_table_assistant_prefill
23
 
24
  if SPECULATIVE_DECODING == "True": SPECULATIVE_DECODING = True
@@ -192,7 +192,10 @@ def load_model(local_model_type:str=CHOSEN_LOCAL_MODEL_TYPE,
192
  torch_device:str=torch_device,
193
  repo_id=LOCAL_REPO_ID,
194
  model_filename=LOCAL_MODEL_FILE,
195
- model_dir=LOCAL_MODEL_FOLDER):
 
196
  '''
197
  Load in a model from Hugging Face hub via the transformers package, or using llama_cpp_python by downloading a GGUF file from Huggingface Hub.
198
 
@@ -206,22 +209,22 @@ def load_model(local_model_type:str=CHOSEN_LOCAL_MODEL_TYPE,
206
  repo_id (str): The Hugging Face repository ID where the model is located.
207
  model_filename (str): The specific filename of the model to download from the repository.
208
  model_dir (str): The local directory where the model will be stored or downloaded.
209
-
 
 
210
  Returns:
211
  tuple: A tuple containing:
212
- - llama_model (Llama): The loaded Llama.cpp model instance.
213
- - tokenizer (list): An empty list (tokenizer is not used with Llama.cpp directly in this setup).
214
  '''
215
  print("Loading model ", local_model_type)
216
- model_path = get_model_path(repo_id=repo_id, model_filename=model_filename, model_dir=model_dir)
217
 
218
  #print("model_path:", model_path)
219
 
220
  # Verify the device and cuda settings
221
  # Check if CUDA is enabled
222
- import torch
223
- from llama_cpp import Llama
224
- from llama_cpp.llama_speculative import LlamaPromptLookupDecoding
225
 
226
  torch.cuda.empty_cache()
227
  print("Is CUDA enabled? ", torch.cuda.is_available())
@@ -252,41 +255,132 @@ def load_model(local_model_type:str=CHOSEN_LOCAL_MODEL_TYPE,
252
  gpu_config.update_gpu(gpu_layers)
253
  gpu_config.update_context(max_context_length)
254
 
255
- try:
256
- print("GPU load variables:" , vars(gpu_config))
257
- if speculative_decoding:
258
- llama_model = Llama(model_path=model_path, type_k=8, type_v=8, flash_attn=True, draft_model=LlamaPromptLookupDecoding(num_pred_tokens=NUM_PRED_TOKENS), **vars(gpu_config))
259
- else:
260
- llama_model = Llama(model_path=model_path, type_k=8, type_v=8, flash_attn=True, **vars(gpu_config))
 
261
 
262
- except Exception as e:
263
- print("GPU load failed due to:", e, "Loading model in CPU mode")
264
- # If fails, go to CPU mode
265
- llama_model = Llama(model_path=model_path, **vars(cpu_config))
 
266
 
267
  print("Loading with", gpu_config.n_gpu_layers, "model layers sent to GPU and a maximum context length of", gpu_config.n_ctx)
268
 
269
  # CPU mode
270
  else:
271
- gpu_config.update_gpu(gpu_layers)
 
272
  cpu_config.update_gpu(gpu_layers)
273
 
274
  # Update context length according to slider
275
- gpu_config.update_context(max_context_length)
276
  cpu_config.update_context(max_context_length)
277
 
278
  if speculative_decoding:
279
- llama_model = Llama(model_path=model_path, draft_model=LlamaPromptLookupDecoding(num_pred_tokens=NUM_PRED_TOKENS), **vars(gpu_config))
280
  else:
281
- llama_model = Llama(model_path=model_path, **vars(cpu_config))
282
 
283
- print("Loading with", cpu_config.n_gpu_layers, "model layers sent to GPU and a maximum context length of", gpu_config.n_ctx)
 
284
 
285
- tokenizer = list()
286
 
287
  print("Finished loading model:", local_model_type)
288
  print("GPU layers assigned to cuda:", gpu_layers)
289
- return llama_model, tokenizer
290
 
291
  def call_llama_cpp_model(formatted_string:str, gen_config:str, model=model):
292
  """
@@ -506,8 +600,83 @@ def call_aws_claude(prompt: str, system_prompt: str, temperature: float, max_tok
506
 
507
  return response
508
 
509
  # Function to send a request and update history
510
- def send_request(prompt: str, conversation_history: List[dict], google_client: ai.Client, config: types.GenerateContentConfig, model_choice: str, system_prompt: str, temperature: float, bedrock_runtime:boto3.Session.client, model_source:str, local_model= list(), assistant_prefill = "", progress=Progress(track_tqdm=True)) -> Tuple[str, List[dict]]:
511
  """
512
  This function sends a request to a language model with the given prompt, conversation history, model configuration, model choice, system prompt, and temperature.
513
  It constructs the full prompt by appending the new user prompt to the conversation history, generates a response from the model, and updates the conversation history with the new prompt and response.
@@ -516,6 +685,8 @@ def send_request(prompt: str, conversation_history: List[dict], google_client: a
516
  """
517
  # Constructing the full prompt from the conversation history
518
  full_prompt = "Conversation history:\n"
 
 
519
 
520
  for entry in conversation_history:
521
  role = entry['role'].capitalize() # Assuming the history is stored with 'role' and 'parts'
@@ -573,13 +744,18 @@ def send_request(prompt: str, conversation_history: List[dict], google_client: a
573
  gen_config = LlamaCPPGenerationConfig()
574
  gen_config.update_temp(temperature)
575
 
576
- response = call_llama_cpp_chatmodel(prompt, system_prompt, gen_config, model=local_model)
 
577
 
578
  #print("Successful call to local model.")
579
  break
580
  except Exception as e:
581
  # If fails, try again after X seconds in case there is a throttle limit
582
- print("Call to Gemma model failed:", e, " Waiting for ", str(timeout_wait), "seconds and trying again.")
583
 
584
  time.sleep(timeout_wait)
585
 
@@ -596,21 +772,24 @@ def send_request(prompt: str, conversation_history: List[dict], google_client: a
596
  if isinstance(response, ResponseObject):
597
  response_text = response.text
598
  conversation_history.append({'role': 'assistant', 'parts': [response_text]})
599
- elif 'choices' in response:
600
  if "gpt-oss" in model_choice:
601
  response_text = response['choices'][0]['message']['content'].split('<|start|>assistant<|channel|>final<|message|>')[1]
602
  else:
603
  response_text = response['choices'][0]['message']['content']
604
  response_text = response_text.strip()
605
  conversation_history.append({'role': 'assistant', 'parts': [response_text]}) #response['choices'][0]['text']]})
606
- else:
607
  response_text = response.text
608
  response_text = response_text.strip()
609
  conversation_history.append({'role': 'assistant', 'parts': [response_text]})
 
610
 
611
- return response, conversation_history, response_text
612
 
613
- def process_requests(prompts: List[str], system_prompt: str, conversation_history: List[dict], whole_conversation: List[str], whole_conversation_metadata: List[str], google_client: ai.Client, config: types.GenerateContentConfig, model_choice: str, temperature: float, bedrock_runtime:boto3.Session.client, model_source:str, batch_no:int = 1, local_model = list(), master:bool = False, assistant_prefill="") -> Tuple[List[ResponseObject], List[dict], List[str], List[str]]:
614
  """
615
  Processes a list of prompts by sending them to the model, appending the responses to the conversation history, and updating the whole conversation and metadata.
616
 
@@ -641,7 +820,7 @@ def process_requests(prompts: List[str], system_prompt: str, conversation_histor
641
 
642
  for prompt in prompts:
643
 
644
- response, conversation_history, response_text = send_request(prompt, conversation_history, google_client=google_client, config=config, model_choice=model_choice, system_prompt=system_prompt, temperature=temperature, local_model=local_model, assistant_prefill=assistant_prefill, bedrock_runtime=bedrock_runtime, model_source=model_source)
645
 
646
  responses.append(response)
647
  whole_conversation.append(system_prompt)
@@ -677,9 +856,16 @@ def process_requests(prompts: List[str], system_prompt: str, conversation_histor
677
  whole_conversation_metadata.append(str(response.usage_metadata))
678
 
679
  elif "Local" in model_source:
680
- output_tokens = response['usage'].get('completion_tokens', 0)
681
- input_tokens = response['usage'].get('prompt_tokens', 0)
682
- whole_conversation_metadata.append(str(response['usage']))
 
683
  except KeyError as e:
684
  print(f"Key error: {e} - Check the structure of response.usage_metadata")
685
  else:
@@ -699,6 +885,7 @@ def call_llm_with_markdown_table_checks(batch_prompts: List[str],
699
  temperature: float,
700
  reported_batch_no: int,
701
  local_model: object,
 
702
  bedrock_runtime:boto3.Session.client,
703
  model_source:str,
704
  MAX_OUTPUT_VALIDATION_ATTEMPTS: int,
@@ -721,6 +908,7 @@ def call_llm_with_markdown_table_checks(batch_prompts: List[str],
721
  - temperature (float): The temperature parameter for the model.
722
  - reported_batch_no (int): The reported batch number.
723
  - local_model (object): The local model to use.
 
724
  - bedrock_runtime (boto3.Session.client): The client object for boto3 Bedrock runtime.
725
  - model_source (str): The source of the model, whether in AWS, Gemini, or local.
726
  - MAX_OUTPUT_VALIDATION_ATTEMPTS (int): The maximum number of attempts to validate the output.
@@ -743,7 +931,7 @@ def call_llm_with_markdown_table_checks(batch_prompts: List[str],
743
  responses, conversation_history, whole_conversation, whole_conversation_metadata, response_text = process_requests(
744
  batch_prompts, system_prompt, conversation_history, whole_conversation,
745
  whole_conversation_metadata, google_client, google_config, model_choice,
746
- call_temperature, bedrock_runtime, model_source, reported_batch_no, local_model, master=master, assistant_prefill=assistant_prefill
747
  )
748
 
749
  stripped_response = response_text.strip()
 
18
  model = list() # Define empty list for model functions to run
19
  tokenizer = list() #[] # Define empty list for model functions to run
20
 
21
+ from tools.config import AWS_REGION, LLM_TEMPERATURE, LLM_TOP_K, LLM_MIN_P, LLM_TOP_P, LLM_REPETITION_PENALTY, LLM_LAST_N_TOKENS, LLM_MAX_NEW_TOKENS, LLM_SEED, LLM_RESET, LLM_STREAM, LLM_THREADS, LLM_BATCH_SIZE, LLM_CONTEXT_LENGTH, LLM_SAMPLE, MAX_TOKENS, TIMEOUT_WAIT, NUMBER_OF_RETRY_ATTEMPTS, MAX_TIME_FOR_LOOP, BATCH_SIZE_DEFAULT, DEDUPLICATION_THRESHOLD, MAX_COMMENT_CHARS, CHOSEN_LOCAL_MODEL_TYPE, LOCAL_REPO_ID, LOCAL_MODEL_FILE, LOCAL_MODEL_FOLDER, HF_TOKEN, LLM_SEED, LLM_MAX_GPU_LAYERS, SPECULATIVE_DECODING, NUM_PRED_TOKENS, USE_LLAMA_CPP, COMPILE_MODE, MODEL_DTYPE, USE_BITSANDBYTES, COMPILE_TRANSFORMERS, OFFLOAD_TO_CPU
22
  from tools.prompts import initial_table_assistant_prefill
23
 
24
  if SPECULATIVE_DECODING == "True": SPECULATIVE_DECODING = True
 
192
  torch_device:str=torch_device,
193
  repo_id=LOCAL_REPO_ID,
194
  model_filename=LOCAL_MODEL_FILE,
195
+ model_dir=LOCAL_MODEL_FOLDER,
196
+ compile_mode=COMPILE_MODE,
197
+ model_dtype=MODEL_DTYPE,
198
+ hf_token=HF_TOKEN):
199
  '''
200
  Load in a model from Hugging Face hub via the transformers package, or using llama_cpp_python by downloading a GGUF file from Huggingface Hub.
201
 
 
209
  repo_id (str): The Hugging Face repository ID where the model is located.
210
  model_filename (str): The specific filename of the model to download from the repository.
211
  model_dir (str): The local directory where the model will be stored or downloaded.
212
+ compile_mode (str): The compilation mode to use for the model.
213
+ model_dtype (str): The data type to use for the model.
214
+ hf_token (str): The Hugging Face token to use for the model.
215
  Returns:
216
  tuple: A tuple containing:
217
+ - model (Llama/transformers model): The loaded Llama.cpp/transformers model instance.
218
+ - tokenizer (list/transformers tokenizer): An empty list (tokenizer is not used with Llama.cpp directly in this setup), or a transformers tokenizer.
219
  '''
220
  print("Loading model ", local_model_type)
221
+ tokenizer = list()
222
 
223
  #print("model_path:", model_path)
224
 
225
  # Verify the device and cuda settings
226
  # Check if CUDA is enabled
227
+ import torch
 
 
228
 
229
  torch.cuda.empty_cache()
230
  print("Is CUDA enabled? ", torch.cuda.is_available())
 
255
  gpu_config.update_gpu(gpu_layers)
256
  gpu_config.update_context(max_context_length)
257
 
258
+ if USE_LLAMA_CPP == "True":
259
+ from llama_cpp import Llama
260
+ from llama_cpp.llama_speculative import LlamaPromptLookupDecoding
261
+
262
+ model_path = get_model_path(repo_id=repo_id, model_filename=model_filename, model_dir=model_dir)
263
+
264
+ try:
265
+ print("GPU load variables:" , vars(gpu_config))
266
+ if speculative_decoding:
267
+ model = Llama(model_path=model_path, type_k=8, type_v=8, flash_attn=True, draft_model=LlamaPromptLookupDecoding(num_pred_tokens=NUM_PRED_TOKENS), **vars(gpu_config))
268
+ else:
269
+ model = Llama(model_path=model_path, type_k=8, type_v=8, flash_attn=True, **vars(gpu_config))
270
 
271
+ except Exception as e:
272
+ print("GPU load failed due to:", e, "Loading model in CPU mode")
273
+ # If fails, go to CPU mode
274
+ model = Llama(model_path=model_path, **vars(cpu_config))
275
+
276
+ else:
277
+ from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
278
+
279
+ print("Loading model from transformers")
280
+ # The Hugging Face model ID to load (repo_id from config), e.g. the official Gemma 3 4B repository
281
+ model_id = repo_id
282
+ # 1. Set Data Type (dtype)
283
+ # For H200/Hopper: 'bfloat16'
284
+ # For RTX 3060/Ampere: 'float16'
285
+ dtype_str = model_dtype
286
+ if dtype_str == "bfloat16":
287
+ torch_dtype = torch.bfloat16
288
+ elif dtype_str == "float16":
289
+ torch_dtype = torch.float16
290
+ else:
291
+ torch_dtype = torch.float32 # A safe fallback
292
+
293
+ # 2. Set Compilation Mode
294
+ # 'max-autotune' gives the best runtime on either GPU but the first compilation pass is slow.
295
+ # 'reduce-overhead' compiles faster at the cost of less aggressive optimisation.
296
+
297
+ print(f"--- System Configuration ---")
298
+ print(f"Using model id: {model_id}")
299
+ print(f"Using dtype: {torch_dtype}")
300
+ print(f"Using compile mode: {compile_mode}")
301
+ print(f"Using bitsandbytes: {USE_BITSANDBYTES}")
302
+ print("--------------------------\n")
303
+
304
+ # --- Load Tokenizer and Model ---
305
+
306
+ # Load the tokenizer first; fall back to the EOS token as the padding token if none is defined
307
+ tokenizer = AutoTokenizer.from_pretrained(model_id)
308
+
309
+ if not tokenizer.pad_token:
310
+ tokenizer.pad_token = tokenizer.eos_token
311
+
312
+ if USE_BITSANDBYTES == "True":
313
+
314
+ if OFFLOAD_TO_CPU == "True":
315
+ # This will be very slow. Requires at least 4GB of VRAM and 32GB of RAM
316
+ print("Using bitsandbytes for quantisation to 8 bits, with offloading to CPU")
317
+ max_memory = {0: "4GB", "cpu": "32GB"} # cap GPU 0 at 4GB and allow up to 32GB of system RAM for offloaded layers
318
+ quantization_config = BitsAndBytesConfig(
319
+ load_in_8bit=True,
321
+ llm_int8_enable_fp32_cpu_offload=True # Note: if bitsandbytes has to offload to CPU, inference will be slow
322
+ )
323
+ else:
+ max_memory = None # no per-device memory cap is needed for the plain 4-bit path
324
+ # For Gemma 4B, requires at least 6GB of VRAM
325
+ print("Using bitsandbytes for quantisation to 4 bits")
326
+ quantization_config = BitsAndBytesConfig(
327
+ load_in_4bit=True,
328
+ bnb_4bit_quant_type="nf4", # Use the modern NF4 quantisation for better performance
329
+ bnb_4bit_compute_dtype=torch_dtype,
330
+ bnb_4bit_use_double_quant=True, # Optional: uses a second quantisation step to save even more memory
331
+ )
332
+
333
+ model = AutoModelForCausalLM.from_pretrained(
334
+ model_id,
335
+ torch_dtype=torch_dtype,
336
+ device_map="auto",
337
+ quantization_config=quantization_config,
+ max_memory=max_memory, # per-device memory cap (only set on the 8-bit CPU-offload path)
338
+ token=hf_token
339
+ )
340
+ else:
341
+ print(f"Loading model without quantisation in {torch_dtype} precision")
342
+ model = AutoModelForCausalLM.from_pretrained(
343
+ model_id,
344
+ torch_dtype=torch_dtype,
345
+ device_map="auto",
346
+ token=hf_token
347
+ )
348
+
349
+ # Compile the Model with the selected mode 🚀
350
+ if COMPILE_TRANSFORMERS == "True":
351
+ try:
352
+ model = torch.compile(model, mode=compile_mode, fullgraph=True)
353
+ except Exception as e:
354
+ print(f"Could not compile model: {e}. Running in eager mode.")
355
 
356
  print("Loading with", gpu_config.n_gpu_layers, "model layers sent to GPU and a maximum context length of", gpu_config.n_ctx)
357
 
358
  # CPU mode
359
  else:
360
+ if USE_LLAMA_CPP == "False":
361
+ raise Warning("Using transformers model in CPU mode is not supported. Please change your config variable USE_LLAMA_CPP to True if you want to do CPU inference.")
362
+
363
+ from llama_cpp import Llama
+ from llama_cpp.llama_speculative import LlamaPromptLookupDecoding
+
+ model_path = get_model_path(repo_id=repo_id, model_filename=model_filename, model_dir=model_dir)
364
+
366
  cpu_config.update_gpu(gpu_layers)
367
 
368
  # Update context length according to slider
370
  cpu_config.update_context(max_context_length)
371
 
372
  if speculative_decoding:
373
+ model = Llama(model_path=model_path, draft_model=LlamaPromptLookupDecoding(num_pred_tokens=NUM_PRED_TOKENS), **vars(cpu_config))
374
  else:
375
+ model = Llama(model_path=model_path, **vars(cpu_config))
376
 
377
+ print("Loading with", cpu_config.n_gpu_layers, "model layers sent to GPU and a maximum context length of", cpu_config.n_ctx)
378
+
379
 
 
380
 
381
  print("Finished loading model:", local_model_type)
382
  print("GPU layers assigned to cuda:", gpu_layers)
383
+ return model, tokenizer
384
 
385
  def call_llama_cpp_model(formatted_string:str, gen_config:str, model=model):
386
  """
 
600
 
601
  return response
602
 
603
+ def call_transformers_model(prompt: str, system_prompt: str, gen_config: LlamaCPPGenerationConfig, model=model, tokenizer=tokenizer):
604
+ """
605
+ This function sends a request to a transformers model with the given prompt, system prompt, and generation configuration.
606
+ """
607
+ # 1. Define the conversation as a list of dictionaries
608
+ conversation = [
609
+ {"role": "system", "content": system_prompt},
610
+ {"role": "user", "content": prompt}
611
+ ]
612
+
613
+ # 2. Apply the chat template
614
+ # This function formats the conversation into the exact string Gemma 3 expects.
615
+ # add_generation_prompt=True adds the special tokens that tell the model it's its turn to speak.
616
+ input_ids = tokenizer.apply_chat_template(
617
+ conversation,
618
+ add_generation_prompt=True,
619
+ return_tensors="pt"
620
+ ).to("cuda")
621
+
622
+ # Warm-up run
623
+ print("Performing warm-up run...")
624
+ _ = model.generate(input_ids, max_new_tokens=50)
625
+ print("Warm-up complete.")
626
+
627
+ # Map LlamaCPP parameters to transformers parameters
628
+ generation_kwargs = {
629
+ 'max_new_tokens': gen_config.max_tokens,
630
+ 'temperature': gen_config.temperature,
631
+ 'top_p': gen_config.top_p,
632
+ 'top_k': gen_config.top_k,
633
+ 'do_sample': True,
634
+ 'pad_token_id': tokenizer.eos_token_id
635
+ }
636
+
637
+ # Remove parameters that don't exist in transformers
638
+ if hasattr(gen_config, 'repeat_penalty'):
639
+ generation_kwargs['repetition_penalty'] = gen_config.repeat_penalty
640
+
641
+ # --- Timed Inference Test ---
642
+ print("\nStarting timed inference test...")
643
+ start_time = time.time()
644
+
645
+
646
+
647
+ outputs = model.generate(
648
+ input_ids,
649
+ **generation_kwargs
650
+ )
651
+
652
+ end_time = time.time()
653
+
654
+ # --- Decode and Display Results ---
655
+ generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
656
+ # To get only the model's reply, we can decode just the newly generated tokens
657
+ new_tokens = outputs[0][input_ids.shape[-1]:]
658
+ assistant_reply = tokenizer.decode(new_tokens, skip_special_tokens=True)
659
+
660
+ num_input_tokens = input_ids.shape[-1] # input_ids has shape (1, seq_len); len() would return the batch size, not the token count
661
+ num_generated_tokens = len(new_tokens)
662
+ duration = end_time - start_time
663
+ tokens_per_second = num_generated_tokens / duration
664
+
665
+ print("\n--- Inference Results ---")
666
+ print(f"System Prompt: {conversation[0]['content']}")
667
+ print(f"User Prompt: {conversation[1]['content']}")
668
+ print("---")
669
+ print(f"Assistant's Reply: {assistant_reply}")
670
+ print("\n--- Performance ---")
671
+ print(f"Time taken: {duration:.2f} seconds")
672
+ print(f"Generated tokens: {num_generated_tokens}")
673
+ print(f"Tokens per second: {tokens_per_second:.2f}")
674
+
675
+ return assistant_reply, num_input_tokens, num_generated_tokens
676
+
677
+
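A minimal usage sketch for call_transformers_model as defined above. GenCfg is a hypothetical stand-in for LlamaCPPGenerationConfig that defines only the attributes the function reads (max_tokens, temperature, top_p, top_k, repeat_penalty); model and tokenizer are assumed to be the pair returned by the loader earlier in this file when USE_LLAMA_CPP is "False".

```python
# Sketch under the assumptions above: `model` and `tokenizer` come from the
# transformers loading path, and GenCfg stands in for LlamaCPPGenerationConfig.
from dataclasses import dataclass

@dataclass
class GenCfg:
    max_tokens: int = 512        # mapped to max_new_tokens
    temperature: float = 0.1
    top_p: float = 0.95
    top_k: int = 40
    repeat_penalty: float = 1.1  # mapped to repetition_penalty

reply, n_input_tokens, n_generated_tokens = call_transformers_model(
    prompt="Summarise the responses below as a markdown table.",
    system_prompt="You are an analyst summarising open-text consultation responses.",
    gen_config=GenCfg(),
    model=model,
    tokenizer=tokenizer,
)
print(f"Generated {n_generated_tokens} tokens from {n_input_tokens} input tokens")
```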
678
  # Function to send a request and update history
679
+ def send_request(prompt: str, conversation_history: List[dict], google_client: ai.Client, config: types.GenerateContentConfig, model_choice: str, system_prompt: str, temperature: float, bedrock_runtime:boto3.Session.client, model_source:str, local_model= list(), tokenizer=tokenizer, assistant_prefill = "", progress=Progress(track_tqdm=True)) -> Tuple[object, List[dict], str, int, int]:
680
  """
681
  This function sends a request to a language model with the given prompt, conversation history, model configuration, model choice, system prompt, and temperature.
682
  It constructs the full prompt by appending the new user prompt to the conversation history, generates a response from the model, and updates the conversation history with the new prompt and response.
 
685
  """
686
  # Constructing the full prompt from the conversation history
687
  full_prompt = "Conversation history:\n"
688
+ num_transformer_input_tokens = 0
689
+ num_transformer_generated_tokens = 0
690
 
691
  for entry in conversation_history:
692
  role = entry['role'].capitalize() # Assuming the history is stored with 'role' and 'parts'
 
744
  gen_config = LlamaCPPGenerationConfig()
745
  gen_config.update_temp(temperature)
746
 
747
+ if USE_LLAMA_CPP == "True":
748
+ response = call_llama_cpp_chatmodel(prompt, system_prompt, gen_config, model=local_model)
749
+
750
+ else:
751
+ response, num_transformer_input_tokens, num_transformer_generated_tokens = call_transformers_model(prompt, system_prompt, gen_config, model=local_model, tokenizer=tokenizer)
752
+ response_text = response
753
 
754
  #print("Successful call to local model.")
755
  break
756
  except Exception as e:
757
  # If fails, try again after X seconds in case there is a throttle limit
758
+ print("Call to local model failed:", e, "- waiting for", str(timeout_wait), "seconds and trying again.")
759
 
760
  time.sleep(timeout_wait)
761
 
 
772
  if isinstance(response, ResponseObject):
773
  response_text = response.text
774
  conversation_history.append({'role': 'assistant', 'parts': [response_text]})
775
+ elif isinstance(response, dict) and 'choices' in response: # llama.cpp model response
776
  if "gpt-oss" in model_choice:
777
  response_text = response['choices'][0]['message']['content'].split('<|start|>assistant<|channel|>final<|message|>')[1]
778
  else:
779
  response_text = response['choices'][0]['message']['content']
780
  response_text = response_text.strip()
781
  conversation_history.append({'role': 'assistant', 'parts': [response_text]}) #response['choices'][0]['text']]})
782
+ elif model_source == "Gemini":
783
  response_text = response.text
784
  response_text = response_text.strip()
785
  conversation_history.append({'role': 'assistant', 'parts': [response_text]})
786
+ else: # Assume transformers model response
787
+ response_text = response
788
+ conversation_history.append({'role': 'assistant', 'parts': [response_text]})
789
 
790
+ return response, conversation_history, response_text, num_transformer_input_tokens, num_transformer_generated_tokens
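The branching above normalises four response shapes into response_text: the ResponseObject wrapper used for AWS Bedrock, the dict returned by llama-cpp-python's chat completion, the Gemini response object, and the plain string returned by call_transformers_model. Purely as an illustration of that logic (send_request keeps it inline), the same normalisation written as a standalone helper:

```python
# Illustrative sketch of the response normalisation performed inline in send_request.
# ResponseObject is the Bedrock wrapper class used elsewhere in this module.
def extract_response_text(response, model_choice: str, model_source: str) -> str:
    if isinstance(response, ResponseObject):                  # AWS Bedrock wrapper
        return response.text
    if isinstance(response, dict) and 'choices' in response:  # llama-cpp-python chat completion
        text = response['choices'][0]['message']['content']
        if "gpt-oss" in model_choice:                         # drop the gpt-oss channel preamble
            text = text.split('<|start|>assistant<|channel|>final<|message|>')[1]
        return text.strip()
    if model_source == "Gemini":                              # google-genai response object
        return response.text.strip()
    return str(response).strip()                              # transformers path returns a plain string
```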
791
 
792
+ def process_requests(prompts: List[str], system_prompt: str, conversation_history: List[dict], whole_conversation: List[str], whole_conversation_metadata: List[str], google_client: ai.Client, config: types.GenerateContentConfig, model_choice: str, temperature: float, bedrock_runtime:boto3.Session.client, model_source:str, batch_no:int = 1, local_model = list(), tokenizer=tokenizer, master:bool = False, assistant_prefill="") -> Tuple[List[ResponseObject], List[dict], List[str], List[str], str]:
793
  """
794
  Processes a list of prompts by sending them to the model, appending the responses to the conversation history, and updating the whole conversation and metadata.
795
 
 
820
 
821
  for prompt in prompts:
822
 
823
+ response, conversation_history, response_text, num_transformer_input_tokens, num_transformer_generated_tokens = send_request(prompt, conversation_history, google_client=google_client, config=config, model_choice=model_choice, system_prompt=system_prompt, temperature=temperature, local_model=local_model, tokenizer=tokenizer, assistant_prefill=assistant_prefill, bedrock_runtime=bedrock_runtime, model_source=model_source)
824
 
825
  responses.append(response)
826
  whole_conversation.append(system_prompt)
 
856
  whole_conversation_metadata.append(str(response.usage_metadata))
857
 
858
  elif "Local" in model_source:
859
+ if USE_LLAMA_CPP == "True":
860
+ output_tokens = response['usage'].get('completion_tokens', 0)
861
+ input_tokens = response['usage'].get('prompt_tokens', 0)
862
+ whole_conversation_metadata.append(str(response['usage']))
863
+
864
+ else:
865
+ input_tokens = num_transformer_input_tokens
866
+ output_tokens = num_transformer_generated_tokens
867
+ whole_conversation_metadata.append('inputTokens: ' + str(input_tokens) + ' outputTokens: ' + str(output_tokens))
868
+
869
  except KeyError as e:
870
  print(f"Key error: {e} - Check the structure of response.usage_metadata")
871
  else:
 
885
  temperature: float,
886
  reported_batch_no: int,
887
  local_model: object,
888
+ tokenizer:object,
889
  bedrock_runtime:boto3.Session.client,
890
  model_source:str,
891
  MAX_OUTPUT_VALIDATION_ATTEMPTS: int,
 
908
  - temperature (float): The temperature parameter for the model.
909
  - reported_batch_no (int): The reported batch number.
910
  - local_model (object): The local model to use.
911
+ - tokenizer (object): The tokenizer to use.
912
  - bedrock_runtime (boto3.Session.client): The client object for boto3 Bedrock runtime.
913
  - model_source (str): The source of the model, whether in AWS, Gemini, or local.
914
  - MAX_OUTPUT_VALIDATION_ATTEMPTS (int): The maximum number of attempts to validate the output.
 
931
  responses, conversation_history, whole_conversation, whole_conversation_metadata, response_text = process_requests(
932
  batch_prompts, system_prompt, conversation_history, whole_conversation,
933
  whole_conversation_metadata, google_client, google_config, model_choice,
934
+ call_temperature, bedrock_runtime, model_source, reported_batch_no, local_model, tokenizer=tokenizer, master=master, assistant_prefill=assistant_prefill
935
  )
936
 
937
  stripped_response = response_text.strip()
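For local models, process_requests above records token usage in two formats: the llama.cpp path appends str(response['usage']) (a dict repr containing prompt_tokens/completion_tokens), while the transformers path appends an 'inputTokens: X outputTokens: Y' string. A hedged sketch of totalling usage across a run from whole_conversation_metadata; the helper and its regexes are illustrative and not part of this commit:

```python
import re
from typing import List, Tuple

# Sketch only: sum token counts from the metadata strings appended by process_requests.
# Assumes each entry contains either llama.cpp's usage dict repr
# ('prompt_tokens'/'completion_tokens') or the 'inputTokens: X outputTokens: Y' format
# used for the transformers path; entries in any other format are simply skipped.
def total_token_usage(whole_conversation_metadata: List[str]) -> Tuple[int, int]:
    total_in = total_out = 0
    for entry in whole_conversation_metadata:
        m_in = re.search(r"(?:inputTokens|prompt_tokens)\D*?(\d+)", entry)
        m_out = re.search(r"(?:outputTokens|completion_tokens)\D*?(\d+)", entry)
        if m_in:
            total_in += int(m_in.group(1))
        if m_out:
            total_out += int(m_out.group(1))
    return total_in, total_out
```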
tools/verify_titles.py CHANGED
@@ -448,7 +448,7 @@ def verify_titles(in_data_file,
448
  summary_whole_conversation = list()
449
 
450
  # Process requests to large language model
451
- responses, summary_conversation_history, whole_conversation, whole_conversation_metadata, response_text = call_llm_with_markdown_table_checks(summary_prompt_list, system_prompt, conversation_history, whole_conversation, whole_conversation_metadata, google_client, google_config, model_choice, temperature, reported_batch_no, local_model, bedrock_runtime, model_source, MAX_OUTPUT_VALIDATION_ATTEMPTS, assistant_prefill=add_existing_topics_assistant_prefill, master = True)
452
 
453
 
454
 
@@ -549,7 +549,7 @@ def verify_titles(in_data_file,
549
 
550
  whole_conversation = [formatted_initial_table_system_prompt]
551
 
552
- responses, conversation_history, whole_conversation, whole_conversation_metadata, response_text = call_llm_with_markdown_table_checks(batch_prompts, system_prompt, conversation_history, whole_conversation, whole_conversation_metadata, google_client, google_config, model_choice, temperature, reported_batch_no, local_model, bedrock_runtime, model_source, MAX_OUTPUT_VALIDATION_ATTEMPTS, assistant_prefill=initial_table_assistant_prefill)
553
 
554
 
555
  topic_table_out_path, reference_table_out_path, unique_topics_df_out_path, topic_table_df, markdown_table, reference_df, new_unique_topics_df, batch_file_path_details, is_error = write_llm_output_and_logs_verify(response_text, whole_conversation, whole_conversation_metadata, file_name, latest_batch_completed, start_row, end_row, model_choice_clean, temperature, log_files_output_paths, existing_reference_df, existing_unique_topics_df, batch_size, chosen_cols, model_name_map=model_name_map, first_run=True)
 
448
  summary_whole_conversation = list()
449
 
450
  # Process requests to large language model
451
+ responses, summary_conversation_history, whole_conversation, whole_conversation_metadata, response_text = call_llm_with_markdown_table_checks(summary_prompt_list, system_prompt, conversation_history, whole_conversation, whole_conversation_metadata, google_client, google_config, model_choice, temperature, reported_batch_no, local_model, tokenizer, bedrock_runtime, model_source, MAX_OUTPUT_VALIDATION_ATTEMPTS, assistant_prefill=add_existing_topics_assistant_prefill, master = True)
452
 
453
 
454
 
 
549
 
550
  whole_conversation = [formatted_initial_table_system_prompt]
551
 
552
+ responses, conversation_history, whole_conversation, whole_conversation_metadata, response_text = call_llm_with_markdown_table_checks(batch_prompts, system_prompt, conversation_history, whole_conversation, whole_conversation_metadata, google_client, google_config, model_choice, temperature, reported_batch_no, local_model, tokenizer, bedrock_runtime, model_source, MAX_OUTPUT_VALIDATION_ATTEMPTS, assistant_prefill=initial_table_assistant_prefill)
553
 
554
 
555
  topic_table_out_path, reference_table_out_path, unique_topics_df_out_path, topic_table_df, markdown_table, reference_df, new_unique_topics_df, batch_file_path_details, is_error = write_llm_output_and_logs_verify(response_text, whole_conversation, whole_conversation_metadata, file_name, latest_batch_completed, start_row, end_row, model_choice_clean, temperature, log_files_output_paths, existing_reference_df, existing_unique_topics_df, batch_size, chosen_cols, model_name_map=model_name_map, first_run=True)