Commit bd1a015 · 1 parent: bd19985
Generally improved inference for low-VRAM systems, improved unsloth usage, updated packages, and switched the default local model to Qwen 3 4B.
Files changed:
- README.md +1 -1
- app.py +12 -11
- tools/aws_functions.py +0 -3
- tools/combine_sheets_into_xlsx.py +1 -2
- tools/config.py +54 -17
- tools/custom_csvlogger.py +2 -5
- tools/dedup_summaries.py +38 -22
- tools/llm_api_call.py +100 -26
- tools/llm_funcs.py +37 -35
- tools/prompts.py +8 -4
- tools/verify_titles.py +11 -11
- windows_install_llama-cpp-python.txt +4 -2
README.md
CHANGED
@@ -97,7 +97,7 @@ The repo provides several requirements files that are relevant for different sit
 - **requirements_no_local**: Can be used to install the app without local model inference for a more lightweight installation.
 - **requirements_gpu.txt**: Used for Python 3.11 GPU-enabled environments. Uncomment the requirements under 'Windows' for Windows compatibility (CUDA 12.4).
-- **requirements_cpu.txt**: Used for Python 3.11 CPU-only environments. Uncomment the requirements under 'Windows' for Windows compatibility.
+- **requirements_cpu.txt**: Used for Python 3.11 CPU-only environments. Uncomment the requirements under 'Windows' for Windows compatibility. Make sure you have [Openblas](https://github.com/OpenMathLib/OpenBLAS) installed!
 - **requirements.txt**: Used for the Python 3.10 GPU-enabled environment on Hugging Face spaces (CUDA 12.4).
 
 2. **Install packages from the requirements file:**
app.py
CHANGED
@@ -51,12 +51,9 @@ if DYNAMODB_USAGE_LOG_HEADERS: DYNAMODB_USAGE_LOG_HEADERS = _get_env_list(DYNAMO
 
 today_rev = datetime.now().strftime("%Y%m%d")
 
-if RUN_LOCAL_MODEL == "1":
-
-
-    default_model_choice = "anthropic.claude-3-haiku-20240307-v1:0"
-else:
-    default_model_choice = "gemini-2.5-flash"
+if RUN_LOCAL_MODEL == "1": default_model_choice = CHOSEN_LOCAL_MODEL_TYPE
+elif RUN_AWS_FUNCTIONS == "1": default_model_choice = "anthropic.claude-3-haiku-20240307-v1:0"
+else: default_model_choice = "gemini-2.5-flash"
 
 # Create the gradio interface
 app = gr.Blocks(theme = gr.themes.Default(primary_hue="blue"), fill_width=True)

@@ -119,6 +116,7 @@ with app:
     summarised_references_markdown = gr.Markdown("", visible=False)
     summarised_outputs_list = gr.Dropdown(value= list(), choices= list(), visible=False, label="List of summarised outputs", allow_custom_value=True)
     latest_summary_completed_num = gr.Number(0, visible=False)
+    add_existing_topics_summary_format_textbox = gr.Textbox(value="", visible=False, label="Add existing topics summary format")
 
     summary_xlsx_output_files_list = gr.Dropdown(value= list(), choices= list(), visible=False, label="List of xlsx summary output files", allow_custom_value=True)

@@ -192,7 +190,7 @@ with app:
     extract_topics_btn = gr.Button("1. Extract topics", variant="secondary")
 
     with gr.Row(equal_height=True):
-        output_messages_textbox = gr.Textbox(value="", label="Output messages", scale=1, interactive=False)
+        output_messages_textbox = gr.Textbox(value="", label="Output messages", scale=1, interactive=False, lines=4)
         topic_extraction_output_files_xlsx = gr.File(label="Overall summary xlsx file", scale=1, interactive=False)
         topic_extraction_output_files = gr.File(label="Extract topics output files", scale=1, interactive=False)

@@ -410,7 +408,8 @@ with app:
         hf_api_key_textbox,
         azure_api_key_textbox,
         output_folder_state,
-        logged_content_df
+        logged_content_df,
+        add_existing_topics_summary_format_textbox],
     outputs=[display_topic_table_markdown,
         master_topic_df_state,
         master_unique_topics_df_state,

@@ -432,7 +431,8 @@ with app:
         output_tokens_num,
         number_of_calls_num,
         output_messages_textbox,
-        logged_content_df
+        logged_content_df,
+        add_existing_topics_summary_format_textbox],
     api_name="extract_topics", show_progress_on=output_messages_textbox).\
     success(lambda *args: usage_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=USAGE_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_USAGE_LOG_HEADERS, replacement_headers=CSV_USAGE_LOG_HEADERS), [session_hash_textbox, original_data_file_name_textbox, in_colnames, model_choice, conversation_metadata_textbox_placeholder, input_tokens_num, output_tokens_num, number_of_calls_num, estimated_time_taken_number, cost_code_choice_drop], None, preprocess=False, api_name="usage_logs").\
     then(collect_output_csvs_and_create_excel_output, inputs=[in_data_files, in_colnames, original_data_file_name_textbox, in_group_col, model_choice, master_reference_df_state, master_unique_topics_df_state, summarised_output_df, missing_df_state, in_excel_sheets, usage_logs_state, model_name_map_state, output_folder_state], outputs=[topic_extraction_output_files_xlsx, summary_xlsx_output_files_list])

@@ -518,7 +518,8 @@ with app:
         log_files_output_list_state,
         model_name_map_state,
         usage_logs_state,
-        logged_content_df
+        logged_content_df,
+        add_existing_topics_summary_format_textbox
         ],
     outputs=[
         display_topic_table_markdown,

@@ -603,7 +604,7 @@ with app:
     success(fn=join_cols_onto_reference_df, inputs=[master_reference_df_state, file_data_state, join_colnames, reference_df_data_file_name_textbox], outputs=[master_reference_df_state_joined, out_join_files])
 
     # Export to xlsx file
-    export_xlsx_btn.click(collect_output_csvs_and_create_excel_output, inputs=[in_data_files, in_colnames, original_data_file_name_textbox, in_group_col, model_choice, master_reference_df_state, master_unique_topics_df_state, summarised_output_df, missing_df_state, in_excel_sheets, usage_logs_state, model_name_map_state, output_folder_state], outputs=[out_xlsx_files], api_name="export_xlsx")
+    export_xlsx_btn.click(collect_output_csvs_and_create_excel_output, inputs=[in_data_files, in_colnames, original_data_file_name_textbox, in_group_col, model_choice, master_reference_df_state, master_unique_topics_df_state, summarised_output_df, missing_df_state, in_excel_sheets, usage_logs_state, model_name_map_state, output_folder_state], outputs=[out_xlsx_files, summary_xlsx_output_files_list], api_name="export_xlsx")
 
     # If relevant environment variable is set, load in the default cost code file from S3 or locally
     if GET_COST_CODES == "True" and (COST_CODES_PATH or S3_COST_CODES_PATH):
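The app.py hunk above changes the default model selection to prefer the configured local model, then Bedrock's Claude Haiku, then Gemini. A minimal sketch of that fallback order as a standalone helper (the function and its parameters are illustrative, not code from this commit; the flag and variable names come from tools/config.py):

```python
# Illustrative sketch only: the fallback order app.py now applies.
def pick_default_model(run_local_model: str, run_aws_functions: str, local_model_type: str) -> str:
    if run_local_model == "1":
        return local_model_type  # e.g. "Qwen 3 4B" from CHOSEN_LOCAL_MODEL_TYPE
    elif run_aws_functions == "1":
        return "anthropic.claude-3-haiku-20240307-v1:0"  # AWS Bedrock fallback
    return "gemini-2.5-flash"  # default when neither flag is set
```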
tools/aws_functions.py
CHANGED
@@ -15,9 +15,6 @@ def connect_to_bedrock_runtime(model_name_map:dict, model_choice:str, aws_access
     if RUN_AWS_FUNCTIONS == "1" and PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS == "1":
         print("Connecting to Bedrock via existing SSO connection")
         bedrock_runtime = boto3.client('bedrock-runtime', region_name=AWS_REGION)
-    elif RUN_AWS_FUNCTIONS == "1" and PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS == "1":
-        print("Connecting to Bedrock via existing SSO connection")
-        bedrock_runtime = boto3.client('bedrock-runtime', region_name=AWS_REGION)
     elif aws_access_key_textbox and aws_secret_key_textbox:
         print("Connecting to Bedrock using AWS access key and secret keys from user input.")
         bedrock_runtime = boto3.client('bedrock-runtime',
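The removed branch was an exact duplicate of the SSO condition above it, so the connection logic now has one SSO path and one explicit-key path. A hedged sketch of the resulting branching (the wrapper function and its argument names are illustrative; the boto3 calls mirror the diff):

```python
import boto3

# Sketch of the simplified branching after the duplicate elif was removed.
def connect_to_bedrock(run_aws: str, prefer_sso: str, access_key: str, secret_key: str, region: str):
    if run_aws == "1" and prefer_sso == "1":
        # Reuse the credentials from an existing SSO session
        return boto3.client('bedrock-runtime', region_name=region)
    elif access_key and secret_key:
        # Fall back to explicit keys supplied by the user
        return boto3.client('bedrock-runtime',
                            region_name=region,
                            aws_access_key_id=access_key,
                            aws_secret_access_key=secret_key)
    return None
```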
tools/combine_sheets_into_xlsx.py
CHANGED
@@ -380,8 +380,7 @@ def collect_output_csvs_and_create_excel_output(in_data_files:List, chosen_cols:
     xlsx_output_filenames = [xlsx_output_filename]
 
     # Delete intermediate csv files
-    for csv_file in new_csv_files:
-        os.remove(csv_file)
+    for csv_file in new_csv_files: os.remove(csv_file)
 
     return xlsx_output_filenames, xlsx_output_filenames
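The intermediate CSV clean-up is collapsed to a single line after the xlsx file is written. A hedged sketch of an equivalent helper that also tolerates files that were already removed (the suppress guard is an illustrative addition, not behaviour of the repo's code):

```python
import os
import contextlib
from typing import List

# Illustrative variant of the one-line clean-up in the diff above.
def remove_intermediate_csvs(csv_paths: List[str]) -> None:
    for csv_file in csv_paths:
        # Ignore paths that have already been deleted (an assumption, not in the commit)
        with contextlib.suppress(FileNotFoundError):
            os.remove(csv_file)
```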
tools/config.py
CHANGED
@@ -190,7 +190,7 @@ if LOGGING == 'True':
 ###
 # App run variables
 ###
-OUTPUT_DEBUG_FILES = get_or_create_env_var('OUTPUT_DEBUG_FILES', '
+OUTPUT_DEBUG_FILES = get_or_create_env_var('OUTPUT_DEBUG_FILES', 'True') # Whether to output debug files
 
 TIMEOUT_WAIT = int(get_or_create_env_var('TIMEOUT_WAIT', '30')) # Maximum number of seconds to wait for a response from the LLM
 NUMBER_OF_RETRY_ATTEMPTS = int(get_or_create_env_var('NUMBER_OF_RETRY_ATTEMPTS', '5')) # Maximum number of times to retry a request to the LLM

@@ -229,7 +229,7 @@ model_full_names = list()
 model_short_names = list()
 model_source = list()
 
-CHOSEN_LOCAL_MODEL_TYPE = get_or_create_env_var("CHOSEN_LOCAL_MODEL_TYPE", "
+CHOSEN_LOCAL_MODEL_TYPE = get_or_create_env_var("CHOSEN_LOCAL_MODEL_TYPE", "Qwen 3 4B") # Gemma 3 1B # "Gemma 2b" # "Gemma 3 4B"
 
 if RUN_LOCAL_MODEL == "1" and CHOSEN_LOCAL_MODEL_TYPE:
     model_full_names.append(CHOSEN_LOCAL_MODEL_TYPE)

@@ -264,8 +264,21 @@ model_name_map = {
 HF_TOKEN = get_or_create_env_var('HF_TOKEN', '')
 
 LOAD_LOCAL_MODEL_AT_START = get_or_create_env_var('LOAD_LOCAL_MODEL_AT_START', 'True')
-USE_LLAMA_CPP = get_or_create_env_var('USE_LLAMA_CPP', 'True') # Llama.cpp or transformers
 
+# If you are using a system with low VRAM, you can set this to True to reduce the memory requirements
+LOW_VRAM_SYSTEM = get_or_create_env_var('LOW_VRAM_SYSTEM', 'False')
+
+if LOW_VRAM_SYSTEM == 'True':
+    print("Changing settings for low VRAM system")
+    USE_LLAMA_CPP = get_or_create_env_var('USE_LLAMA_CPP', 'True')
+    LLM_MAX_NEW_TOKENS = int(get_or_create_env_var('LLM_MAX_NEW_TOKENS', '4096'))
+    LLM_CONTEXT_LENGTH = int(get_or_create_env_var('LLM_CONTEXT_LENGTH', '8192'))
+    LLM_BATCH_SIZE = int(get_or_create_env_var('LLM_BATCH_SIZE', '512'))
+    KV_QUANT_LEVEL = int(get_or_create_env_var('KV_QUANT_LEVEL', '2')) # 2 is equivalent to q4_0, 8 is q8_0
+
+USE_LLAMA_CPP = get_or_create_env_var('USE_LLAMA_CPP', 'True') # Llama.cpp or transformers with unsloth
 
 GEMMA2_REPO_ID = get_or_create_env_var("GEMMA2_2B_REPO_ID", "unsloth/gemma-2-it-GGUF")
 GEMMA2_REPO_TRANSFORMERS_ID = get_or_create_env_var("GEMMA2_2B_REPO_TRANSFORMERS_ID", "unsloth/gemma-2-2b-it-bnb-4bit")

@@ -293,18 +306,31 @@ GEMMA3_4B_MODEL_FOLDER = get_or_create_env_var("GEMMA3_4B_MODEL_FOLDER", "model/
 
 GPT_OSS_REPO_ID = get_or_create_env_var("GPT_OSS_REPO_ID", "unsloth/gpt-oss-20b-GGUF")
 GPT_OSS_REPO_TRANSFORMERS_ID = get_or_create_env_var("GPT_OSS_REPO_TRANSFORMERS_ID", "unsloth/gpt-oss-20b-unsloth-bnb-4bit")
-if USE_LLAMA_CPP == "False":
-    GPT_OSS_REPO_ID = GPT_OSS_REPO_TRANSFORMERS_ID
+if USE_LLAMA_CPP == "False": GPT_OSS_REPO_ID = GPT_OSS_REPO_TRANSFORMERS_ID
 
 GPT_OSS_MODEL_FILE = get_or_create_env_var("GPT_OSS_MODEL_FILE", "gpt-oss-20b-F16.gguf")
 GPT_OSS_MODEL_FOLDER = get_or_create_env_var("GPT_OSS_MODEL_FOLDER", "model/gpt_oss")
 
 USE_SPECULATIVE_DECODING = get_or_create_env_var("USE_SPECULATIVE_DECODING", "False")
-ASSISTANT_MODEL = get_or_create_env_var("ASSISTANT_MODEL", "unsloth/gemma-3-270m-it")
 
+if CHOSEN_LOCAL_MODEL_TYPE == "Gemma 3 4B": ASSISTANT_MODEL = get_or_create_env_var("ASSISTANT_MODEL", "unsloth/gemma-3-270m-it")
+elif CHOSEN_LOCAL_MODEL_TYPE == "Qwen 3 4B": ASSISTANT_MODEL = get_or_create_env_var("ASSISTANT_MODEL", "unsloth/Qwen3-0.6B")
+
+DRAFT_MODEL_LOC = get_or_create_env_var("DRAFT_MODEL_LOC", ".cache/llama.cpp/")
+
+GEMMA3_DRAFT_MODEL_LOC = get_or_create_env_var("GEMMA3_DRAFT_MODEL_LOC", DRAFT_MODEL_LOC + "unsloth_gemma-3-270m-it-qat-GGUF_gemma-3-270m-it-qat-F16.gguf")
+
+GEMMA3_4B_DRAFT_MODEL_LOC = get_or_create_env_var("GEMMA3_4B_DRAFT_MODEL_LOC", DRAFT_MODEL_LOC + "unsloth_gemma-3-4b-it-qat-GGUF_gemma-3-4b-it-qat-Q4_K_M.gguf")
 
+QWEN3_4B_REPO_ID = get_or_create_env_var("QWEN3_4B_REPO_ID", "unsloth/Qwen3-4B-Instruct-2507-GGUF")
+QWEN3_4B_REPO_TRANSFORMERS_ID = get_or_create_env_var("QWEN3_4B_REPO_TRANSFORMERS_ID", "unsloth/Qwen3-4B-unsloth-bnb-4bit")
+if USE_LLAMA_CPP == "False": QWEN3_4B_REPO_ID = QWEN3_4B_REPO_TRANSFORMERS_ID
+
+QWEN3_4B_MODEL_FILE = get_or_create_env_var("QWEN3_4B_MODEL_FILE", "Qwen3-4B-Instruct-2507-Q4_K_M.gguf")
+QWEN3_4B_MODEL_FOLDER = get_or_create_env_var("QWEN3_4B_MODEL_FOLDER", "model/qwen")
+
+QWEN3_DRAFT_MODEL_LOC = get_or_create_env_var("QWEN3_DRAFT_MODEL_LOC", DRAFT_MODEL_LOC + "Qwen3-0.6B-Q8_0.gguf")
+QWEN3_4B_DRAFT_MODEL_LOC = get_or_create_env_var("QWEN3_4B_DRAFT_MODEL_LOC", DRAFT_MODEL_LOC + "Qwen3-4B-Instruct-2507-Q4_K_M.gguf")
 
 if CHOSEN_LOCAL_MODEL_TYPE == "Gemma 2b":
     LOCAL_REPO_ID = GEMMA2_REPO_ID

@@ -322,34 +348,45 @@ elif CHOSEN_LOCAL_MODEL_TYPE == "Gemma 3 4B":
     LOCAL_MODEL_FILE = GEMMA3_4B_MODEL_FILE
     LOCAL_MODEL_FOLDER = GEMMA3_4B_MODEL_FOLDER
 
+elif CHOSEN_LOCAL_MODEL_TYPE == "Qwen 3 4B":
+    LOCAL_REPO_ID = QWEN3_4B_REPO_ID
+    LOCAL_MODEL_FILE = QWEN3_4B_MODEL_FILE
+    LOCAL_MODEL_FOLDER = QWEN3_4B_MODEL_FOLDER
+
 elif CHOSEN_LOCAL_MODEL_TYPE == "gpt-oss-20b":
     LOCAL_REPO_ID = GPT_OSS_REPO_ID
     LOCAL_MODEL_FILE = GPT_OSS_MODEL_FILE
     LOCAL_MODEL_FOLDER = GPT_OSS_MODEL_FOLDER
 
 LLM_MAX_GPU_LAYERS = int(get_or_create_env_var('LLM_MAX_GPU_LAYERS','-1')) # Maximum possible
-LLM_TEMPERATURE = float(get_or_create_env_var('LLM_TEMPERATURE', '0.
+LLM_TEMPERATURE = float(get_or_create_env_var('LLM_TEMPERATURE', '0.6'))
 LLM_TOP_K = int(get_or_create_env_var('LLM_TOP_K','64')) # https://docs.unsloth.ai/basics/gemma-3-how-to-run-and-fine-tune
 LLM_MIN_P = float(get_or_create_env_var('LLM_MIN_P', '0'))
 LLM_TOP_P = float(get_or_create_env_var('LLM_TOP_P', '0.95'))
 LLM_REPETITION_PENALTY = float(get_or_create_env_var('LLM_REPETITION_PENALTY', '1.0'))
 
 LLM_LAST_N_TOKENS = int(get_or_create_env_var('LLM_LAST_N_TOKENS', '512'))
-LLM_MAX_NEW_TOKENS = int(get_or_create_env_var('LLM_MAX_NEW_TOKENS', '
+LLM_MAX_NEW_TOKENS = int(get_or_create_env_var('LLM_MAX_NEW_TOKENS', '8192'))
 LLM_SEED = int(get_or_create_env_var('LLM_SEED', '42'))
 LLM_RESET = get_or_create_env_var('LLM_RESET', 'True')
 LLM_STREAM = get_or_create_env_var('LLM_STREAM', 'True')
 LLM_THREADS = int(get_or_create_env_var('LLM_THREADS', '-1'))
-LLM_BATCH_SIZE = int(get_or_create_env_var('LLM_BATCH_SIZE', '
-LLM_CONTEXT_LENGTH = int(get_or_create_env_var('LLM_CONTEXT_LENGTH', '
+LLM_BATCH_SIZE = int(get_or_create_env_var('LLM_BATCH_SIZE', '512'))
+LLM_CONTEXT_LENGTH = int(get_or_create_env_var('LLM_CONTEXT_LENGTH', '32768'))
 LLM_SAMPLE = get_or_create_env_var('LLM_SAMPLE', 'True')
-LLM_STOP_STRINGS = get_or_create_env_var('LLM_STOP_STRINGS', r"['\n\n\n\n']")
+LLM_STOP_STRINGS = get_or_create_env_var('LLM_STOP_STRINGS', r"[' ','\n\n\n\n','---------------------------------------------]")
+MULTIMODAL_PROMPT_FORMAT = get_or_create_env_var('MULTIMODAL_PROMPT_FORMAT', 'False')
 SPECULATIVE_DECODING = get_or_create_env_var('SPECULATIVE_DECODING', 'False')
 NUM_PRED_TOKENS = int(get_or_create_env_var('NUM_PRED_TOKENS', '2'))
-
-
-
-
+KV_QUANT_LEVEL = int(get_or_create_env_var('KV_QUANT_LEVEL', '16'))
+
+# If you are using e.g. gpt-oss, you can add a reasoning suffix to set reasoning level, or turn it off in the case of Qwen 3 4B
+if CHOSEN_LOCAL_MODEL_TYPE == "gpt-oss-20b": REASONING_SUFFIX = get_or_create_env_var('REASONING_SUFFIX', 'Reasoning: low')
+elif CHOSEN_LOCAL_MODEL_TYPE == "Qwen 3 4B" and USE_LLAMA_CPP == "False": REASONING_SUFFIX = get_or_create_env_var('REASONING_SUFFIX', '/nothink')
+else: REASONING_SUFFIX = get_or_create_env_var('REASONING_SUFFIX', '')
 
 # Transformers variables
 COMPILE_TRANSFORMERS = get_or_create_env_var('COMPILE_TRANSFORMERS', 'False') # Whether to compile transformers models
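Because every value in tools/config.py is read through get_or_create_env_var, the new low-VRAM behaviour can be switched on from the environment before the module is imported. A minimal sketch, assuming only that the variables are read at import time and that get_or_create_env_var persists the first value it creates (the os.environ approach is illustrative; the defaults named in the comments come from the hunk above):

```python
import os

# Sketch: enable the new low-VRAM path before tools.config is imported.
os.environ["RUN_LOCAL_MODEL"] = "1"
os.environ["CHOSEN_LOCAL_MODEL_TYPE"] = "Qwen 3 4B"   # new default local model
os.environ["LOW_VRAM_SYSTEM"] = "True"                # shrinks max new tokens, context and batch size, and quantises the KV cache

from tools import config  # reads the environment variables on import

# Expected to reflect the low-VRAM defaults (4096 new tokens, 8192 context,
# batch size 512, KV_QUANT_LEVEL 2) if the first created value wins - an assumption.
print(config.LLM_CONTEXT_LENGTH)
print(config.KV_QUANT_LEVEL)
```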
tools/custom_csvlogger.py
CHANGED
@@ -14,8 +14,7 @@ from multiprocessing import Lock
 from pathlib import Path
 from typing import TYPE_CHECKING, Any
 from gradio_client import utils as client_utils
-
-from gradio import utils, wasm_utils
+from gradio import utils
 from tools.config import AWS_REGION, AWS_ACCESS_KEY, AWS_SECRET_KEY, RUN_AWS_FUNCTIONS
 

@@ -56,9 +55,7 @@ class CSVLogger_custom(FlaggingCallback):
         self.simplify_file_data = simplify_file_data
         self.verbose = verbose
         self.dataset_file_name = dataset_file_name
-        self.lock = (
-            Lock() if not wasm_utils.IS_WASM else contextlib.nullcontext()
-        ) # The multiprocessing module doesn't work on Lite.
+        self.lock = Lock()
 
     def setup(
         self,
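Dropping the wasm_utils check means the logger now always holds a real multiprocessing.Lock rather than a null context on Lite builds. A hedged sketch of the kind of guarded CSV append such a lock enables (the class and method below are placeholders, not the repo's CSVLogger_custom API):

```python
import csv
from multiprocessing import Lock

# Illustrative only: serialising writes with a multiprocessing.Lock.
class TinyCsvLogger:
    def __init__(self, path: str):
        self.path = path
        self.lock = Lock()  # mirrors `self.lock = Lock()` in the diff

    def append_row(self, row: list) -> None:
        # Hold the lock so concurrent flag() calls cannot interleave rows
        with self.lock:
            with open(self.path, "a", newline="", encoding="utf-8") as f:
                csv.writer(f).writerow(row)
```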
tools/dedup_summaries.py
CHANGED
@@ -161,8 +161,6 @@ def deduplicate_topics(reference_df:pd.DataFrame,
 
     reference_file_out_path = output_folder + reference_table_file_name
     unique_topics_file_out_path = output_folder + unique_topics_table_file_name
-    #reference_df.to_csv(reference_file_out_path, index = None, encoding='utf-8-sig')
-    #topic_summary_df.to_csv(unique_topics_file_out_path, index=None, encoding='utf-8-sig')
 
     output_files.append(reference_file_out_path)
     output_files.append(unique_topics_file_out_path)

@@ -195,13 +193,17 @@ def deduplicate_topics(reference_df:pd.DataFrame,
     if "Group" not in reference_df.columns:
         reference_df["Group"] = "All"
     for i in range(0, 8):
-        if merge_sentiment == "No":
+        if merge_sentiment == "No":
             if merge_general_topics == "No":
                 reference_df["old_category"] = reference_df["Subtopic"] + " | " + reference_df["Sentiment"]
                 reference_df_unique = reference_df.drop_duplicates("old_category")
 
-
-
+                # Create an empty list to store results from each group
+                results = []
+                # Iterate over each group instead of using .apply()
+                for name, group in reference_df_unique.groupby(["General topic", "Sentiment", "Group"]):
+                    # Run your function on the 'group' DataFrame
+                    result = deduplicate_categories(
                         group["Subtopic"],
                         group["Sentiment"],
                         reference_df,

@@ -209,30 +211,38 @@ def deduplicate_topics(reference_df:pd.DataFrame,
                         merge_general_topics="No",
                         threshold=score_threshold
                     )
-
+                    results.append(result)
+
+                # Concatenate all the results into a single DataFrame
+                deduplicated_topic_map_df = pd.concat(results).reset_index(drop=True)
+                # --- MODIFIED SECTION END ---
+
             else:
                 # This case should allow cross-topic matching but is still grouping by Sentiment
                 reference_df["old_category"] = reference_df["Subtopic"] + " | " + reference_df["Sentiment"]
                 reference_df_unique = reference_df.drop_duplicates("old_category")
-
-
-
+
+                results = []
+                for name, group in reference_df_unique.groupby("Sentiment"):
+                    result = deduplicate_categories(
                         group["Subtopic"],
                         group["Sentiment"],
                         reference_df,
-                        general_topic_series=None,
+                        general_topic_series=None,
                         merge_general_topics="Yes",
                         threshold=score_threshold
                     )
-
+                    results.append(result)
+                deduplicated_topic_map_df = pd.concat(results).reset_index(drop=True)
+
         else:
             if merge_general_topics == "No":
-                # Update this case to maintain general topic boundaries
                 reference_df["old_category"] = reference_df["Subtopic"] + " | " + reference_df["Sentiment"]
                 reference_df_unique = reference_df.drop_duplicates("old_category")
-
-
-
+
+                results = []
+                for name, group in reference_df_unique.groupby("General topic"):
+                    result = deduplicate_categories(
                         group["Subtopic"],
                         group["Sentiment"],
                         reference_df,

@@ -241,9 +251,10 @@ def deduplicate_topics(reference_df:pd.DataFrame,
                         merge_sentiment=merge_sentiment,
                         threshold=score_threshold
                     )
-
-
-
+                    results.append(result)
+                deduplicated_topic_map_df = pd.concat(results).reset_index(drop=True)
+
+            else:
                 reference_df["old_category"] = reference_df["Subtopic"] + " | " + reference_df["Sentiment"]
                 reference_df_unique = reference_df.drop_duplicates("old_category")
 

@@ -251,14 +262,13 @@ def deduplicate_topics(reference_df:pd.DataFrame,
                     reference_df_unique["Subtopic"],
                     reference_df_unique["Sentiment"],
                     reference_df,
-                    general_topic_series=None,
+                    general_topic_series=None,
                     merge_general_topics="Yes",
                     merge_sentiment=merge_sentiment,
                     threshold=score_threshold
                 ).reset_index(drop=True)
-
+
         if deduplicated_topic_map_df['deduplicated_category'].isnull().all():
-            # Check if 'deduplicated_category' contains any values
             print("No deduplicated categories found, skipping the following code.")
 
         else:

@@ -785,6 +795,9 @@ def summarise_output_topics(sampled_reference_table_df:pd.DataFrame,
     for prompt, summary, metadata, batch, model_choice, validated, group, task_type, file_name in zip(all_prompts_content, all_summaries_content, all_metadata_content, all_batches_content, all_model_choice_content, all_validated_content, all_groups_content, all_task_type_content, all_file_names_content)
     ]
 
+    if isinstance(existing_logged_content, pd.DataFrame):
+        existing_logged_content = existing_logged_content.to_dict(orient="records")
+
     out_logged_content = existing_logged_content + all_logged_content
 
     ### Save output files

@@ -1004,7 +1017,7 @@ def overall_summary(topic_summary_df:pd.DataFrame,
     # Write overall outputs to csv
     overall_summary_output_csv_path = output_folder + batch_file_path_details + "_overall_summary_" + model_choice_clean_short + ".csv"
     summarised_outputs_df = pd.DataFrame(data={"Group":unique_groups, "Summary":summarised_outputs_for_df})
-    summarised_outputs_df.to_csv(overall_summary_output_csv_path, index=None)
+    summarised_outputs_df.to_csv(overall_summary_output_csv_path, index=None, encoding='utf-8-sig')
     output_files.append(overall_summary_output_csv_path)
 
     summarised_outputs_df_for_display = pd.DataFrame(data={"Group":unique_groups, "Summary":summarised_outputs})

@@ -1031,6 +1044,9 @@ def overall_summary(topic_summary_df:pd.DataFrame,
     for prompt, summary, metadata, batch, model_choice, validated, group, task_type, file_name in zip(all_prompts_content, all_summaries_content, all_metadata_content, all_batches_content, all_model_choice_content, all_validated_content, all_groups_content, all_task_type_content, all_file_names_content)
     ]
 
+    if isinstance(existing_logged_content, pd.DataFrame):
+        existing_logged_content = existing_logged_content.to_dict(orient="records")
+
    out_logged_content = existing_logged_content + all_logged_content
 
     return output_files, html_output_table, summarised_outputs_df, out_metadata_str, input_tokens_num, output_tokens_num, number_of_calls_num, time_taken, out_message, out_logged_content
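The deduplication hunks above replace an implicit per-group apply with an explicit loop that runs deduplicate_categories on each group, collects the results, and concatenates them once. A self-contained sketch of that pattern (the toy frame, stub function, and grouping columns are illustrative; only the loop-and-concat structure mirrors the diff):

```python
import pandas as pd

# Toy data standing in for the reference table (columns are illustrative)
reference_df_unique = pd.DataFrame({
    "General topic": ["Service", "Service", "Price"],
    "Sentiment": ["Positive", "Negative", "Negative"],
    "Group": ["All", "All", "All"],
    "Subtopic": ["Helpful staff", "Slow replies", "Too expensive"],
})

def deduplicate_categories_stub(subtopics: pd.Series, sentiments: pd.Series) -> pd.DataFrame:
    # Placeholder for the real deduplicate_categories(); it just pairs the inputs up.
    return pd.DataFrame({"Subtopic": subtopics, "Sentiment": sentiments})

# The pattern the diff introduces: loop over groups, collect, then concat once.
results = []
for name, group in reference_df_unique.groupby(["General topic", "Sentiment", "Group"]):
    results.append(deduplicate_categories_stub(group["Subtopic"], group["Sentiment"]))

deduplicated_topic_map_df = pd.concat(results).reset_index(drop=True)
print(deduplicated_topic_map_df)
```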
tools/llm_api_call.py
CHANGED
|
@@ -15,7 +15,7 @@ from typing import List, Tuple, Any
|
|
| 15 |
from io import StringIO
|
| 16 |
GradioFileData = gr.FileData
|
| 17 |
|
| 18 |
-
from tools.prompts import initial_table_prompt, prompt2, prompt3, initial_table_system_prompt, add_existing_topics_system_prompt, add_existing_topics_prompt, force_existing_topics_prompt, allow_new_topics_prompt, force_single_topic_prompt, add_existing_topics_assistant_prefill, initial_table_assistant_prefill, structured_summary_prompt
|
| 19 |
from tools.helper_functions import read_file, put_columns_in_df, wrap_text, initial_clean, load_in_data_file, load_in_file, create_topic_summary_df_from_reference_table, convert_reference_table_to_pivot_table, get_basic_response_data, clean_column_name, load_in_previous_data_files, create_batch_file_path_details, move_overall_summary_output_files_to_front_page
|
| 20 |
from tools.llm_funcs import ResponseObject, construct_gemini_generative_model, call_llm_with_markdown_table_checks, create_missing_references_df, calculate_tokens_from_metadata, construct_azure_client, get_model, get_tokenizer, get_assistant_model
|
| 21 |
from tools.config import RUN_LOCAL_MODEL, AWS_REGION, MAX_COMMENT_CHARS, MAX_OUTPUT_VALIDATION_ATTEMPTS, LLM_MAX_NEW_TOKENS, TIMEOUT_WAIT, NUMBER_OF_RETRY_ATTEMPTS, MAX_TIME_FOR_LOOP, BATCH_SIZE_DEFAULT, DEDUPLICATION_THRESHOLD, model_name_map, OUTPUT_FOLDER, CHOSEN_LOCAL_MODEL_TYPE, LOCAL_REPO_ID, LOCAL_MODEL_FILE, LOCAL_MODEL_FOLDER, LLM_SEED, MAX_GROUPS, REASONING_SUFFIX, AZURE_INFERENCE_ENDPOINT, MAX_ROWS, MAXIMUM_ZERO_SHOT_TOPICS, MAX_SPACES_GPU_RUN_TIME, OUTPUT_DEBUG_FILES
|
|
@@ -352,9 +352,9 @@ def write_llm_output_and_logs(response_text: str,
|
|
| 352 |
topic_table_out_path = "topic_table_error.csv"
|
| 353 |
reference_table_out_path = "reference_table_error.csv"
|
| 354 |
topic_summary_df_out_path = "unique_topic_table_error.csv"
|
| 355 |
-
topic_with_response_df = pd.DataFrame()
|
| 356 |
-
out_reference_df = pd.DataFrame()
|
| 357 |
-
out_topic_summary_df = pd.DataFrame()
|
| 358 |
is_error = False # If there was an error in parsing, return boolean saying error
|
| 359 |
# Convert conversation to string and add to log outputs
|
| 360 |
whole_conversation_str = '\n'.join(whole_conversation)
|
|
@@ -385,6 +385,7 @@ def write_llm_output_and_logs(response_text: str,
|
|
| 385 |
topic_with_response_df, is_error = convert_response_text_to_dataframe(response_text)
|
| 386 |
except Exception as e:
|
| 387 |
print("Error in parsing markdown table from response text:", e)
|
|
|
|
| 388 |
return topic_table_out_path, reference_table_out_path, topic_summary_df_out_path, topic_with_response_df, out_reference_df, out_topic_summary_df, batch_file_path_details, is_error
|
| 389 |
|
| 390 |
# Rename columns to ensure consistent use of data frames later in code
|
|
@@ -420,8 +421,11 @@ def write_llm_output_and_logs(response_text: str,
|
|
| 420 |
for index, row in topic_with_response_df.iterrows():
|
| 421 |
references = re.findall(r'\d+', str(row.iloc[3])) if pd.notna(row.iloc[3]) else []
|
| 422 |
# If no numbers found in the Response References column, check the Summary column in case reference numbers were put there by mistake
|
| 423 |
-
if not references:
|
| 424 |
-
|
|
|
|
|
|
|
|
|
|
| 425 |
|
| 426 |
# Filter out references that are outside the valid range
|
| 427 |
if references:
|
|
@@ -695,6 +699,7 @@ def extract_topics(in_data_file: GradioFileData,
|
|
| 695 |
assistant_model:object=list(),
|
| 696 |
max_rows:int=max_rows,
|
| 697 |
original_full_file_name:str="",
|
|
|
|
| 698 |
progress=Progress(track_tqdm=False)):
|
| 699 |
|
| 700 |
'''
|
|
@@ -749,6 +754,7 @@ def extract_topics(in_data_file: GradioFileData,
|
|
| 749 |
- assistant_model: Assistant model object for local inference.
|
| 750 |
- max_rows: The maximum number of rows to process.
|
| 751 |
- original_full_file_name: The original full file name.
|
|
|
|
| 752 |
- progress (Progress): A progress tracker.
|
| 753 |
|
| 754 |
'''
|
|
@@ -863,6 +869,9 @@ def extract_topics(in_data_file: GradioFileData,
|
|
| 863 |
# Call the function to prepare the input table
|
| 864 |
simplified_csv_table_path, normalised_simple_markdown_table, start_row, end_row, batch_basic_response_df = data_file_to_markdown_table(file_data, file_name, chosen_cols, latest_batch_completed, batch_size)
|
| 865 |
|
|
|
|
|
|
|
|
|
|
| 866 |
# Conversation history
|
| 867 |
conversation_history = list()
|
| 868 |
|
|
@@ -951,11 +960,15 @@ def extract_topics(in_data_file: GradioFileData,
|
|
| 951 |
# Format the summary prompt with the response table and topics
|
| 952 |
if produce_structures_summary_radio != "Yes":
|
| 953 |
formatted_summary_prompt = add_existing_topics_prompt.format(response_table=normalised_simple_markdown_table,
|
| 954 |
-
|
| 955 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 956 |
else:
|
| 957 |
formatted_summary_prompt = structured_summary_prompt.format(response_table=normalised_simple_markdown_table,
|
| 958 |
-
|
| 959 |
|
| 960 |
full_prompt = formatted_system_prompt + "\n" + formatted_summary_prompt
|
| 961 |
|
|
@@ -997,7 +1010,7 @@ def extract_topics(in_data_file: GradioFileData,
|
|
| 997 |
|
| 998 |
## Reference table mapping response numbers to topics
|
| 999 |
if output_debug_files == "True":
|
| 1000 |
-
new_reference_df.to_csv(reference_table_out_path, index=None)
|
| 1001 |
out_file_paths.append(reference_table_out_path)
|
| 1002 |
|
| 1003 |
## Unique topic list
|
|
@@ -1006,7 +1019,7 @@ def extract_topics(in_data_file: GradioFileData,
|
|
| 1006 |
new_topic_summary_df["Group"] = group_name
|
| 1007 |
|
| 1008 |
if output_debug_files == "True":
|
| 1009 |
-
new_topic_summary_df.to_csv(topic_summary_df_out_path, index=None)
|
| 1010 |
out_file_paths.append(topic_summary_df_out_path)
|
| 1011 |
|
| 1012 |
# Outputs for markdown table output
|
|
@@ -1039,7 +1052,8 @@ def extract_topics(in_data_file: GradioFileData,
|
|
| 1039 |
|
| 1040 |
# Format the summary prompt with the response table and topics
|
| 1041 |
if produce_structures_summary_radio != "Yes":
|
| 1042 |
-
formatted_initial_table_prompt = initial_table_prompt.format(response_table=normalised_simple_markdown_table, sentiment_choices=sentiment_prompt
|
|
|
|
| 1043 |
else:
|
| 1044 |
unique_topics_markdown="No suggested headings for this summary"
|
| 1045 |
formatted_initial_table_prompt = structured_summary_prompt.format(response_table=normalised_simple_markdown_table, topics=unique_topics_markdown)
|
|
@@ -1076,7 +1090,7 @@ def extract_topics(in_data_file: GradioFileData,
|
|
| 1076 |
if output_debug_files == "True":
|
| 1077 |
|
| 1078 |
# Output reference table
|
| 1079 |
-
reference_df.to_csv(reference_table_out_path, index=None)
|
| 1080 |
out_file_paths.append(reference_table_out_path)
|
| 1081 |
|
| 1082 |
## Unique topic list
|
|
@@ -1086,7 +1100,7 @@ def extract_topics(in_data_file: GradioFileData,
|
|
| 1086 |
new_topic_summary_df["Group"] = group_name
|
| 1087 |
|
| 1088 |
if output_debug_files == "True":
|
| 1089 |
-
new_topic_summary_df.to_csv(topic_summary_df_out_path, index=None)
|
| 1090 |
out_file_paths.append(topic_summary_df_out_path)
|
| 1091 |
|
| 1092 |
whole_conversation_metadata.append(whole_conversation_metadata_str)
|
|
@@ -1160,7 +1174,7 @@ def extract_topics(in_data_file: GradioFileData,
|
|
| 1160 |
basic_response_data_out_path = output_folder + file_path_details + "_simplified_data_file_" + model_choice_clean_short + "_temp_" + str(temperature) + ".csv"
|
| 1161 |
|
| 1162 |
## Reference table mapping response numbers to topics
|
| 1163 |
-
existing_reference_df.to_csv(reference_table_out_path, index=None)
|
| 1164 |
out_file_paths.append(reference_table_out_path)
|
| 1165 |
join_file_paths.append(reference_table_out_path)
|
| 1166 |
|
|
@@ -1250,6 +1264,7 @@ def wrapper_extract_topics_per_column_value(
|
|
| 1250 |
azure_api_key_textbox:str="",
|
| 1251 |
output_folder: str = OUTPUT_FOLDER,
|
| 1252 |
existing_logged_content:list=list(),
|
|
|
|
| 1253 |
force_single_topic_prompt: str = force_single_topic_prompt,
|
| 1254 |
max_tokens: int = max_tokens,
|
| 1255 |
model_name_map: dict = model_name_map,
|
|
@@ -1304,6 +1319,7 @@ def wrapper_extract_topics_per_column_value(
|
|
| 1304 |
:param output_folder: The folder where output files will be saved.
|
| 1305 |
:param existing_logged_content: A list of existing logged content.
|
| 1306 |
:param force_single_topic_prompt: Prompt for forcing a single topic.
|
|
|
|
| 1307 |
:param max_tokens: Maximum tokens for LLM generation.
|
| 1308 |
:param model_name_map: Dictionary mapping model names to their properties.
|
| 1309 |
:param max_time_for_loop: Maximum time allowed for the processing loop.
|
|
@@ -1312,7 +1328,7 @@ def wrapper_extract_topics_per_column_value(
|
|
| 1312 |
:param model: Model object for local inference.
|
| 1313 |
:param tokenizer: Tokenizer object for local inference.
|
| 1314 |
:param assistant_model: Assistant model object for local inference.
|
| 1315 |
-
:param max_rows: The maximum number of rows to process.
|
| 1316 |
:param progress: Gradio Progress object for tracking progress.
|
| 1317 |
:return: A tuple containing consolidated results, mimicking the return structure of `extract_topics`.
|
| 1318 |
"""
|
|
@@ -1488,6 +1504,7 @@ def wrapper_extract_topics_per_column_value(
|
|
| 1488 |
max_rows=max_rows,
|
| 1489 |
existing_logged_content=all_logged_content,
|
| 1490 |
original_full_file_name=original_file_name,
|
|
|
|
| 1491 |
progress=progress
|
| 1492 |
)
|
| 1493 |
|
|
@@ -1521,21 +1538,23 @@ def wrapper_extract_topics_per_column_value(
|
|
| 1521 |
# For now, it will continue
|
| 1522 |
continue
|
| 1523 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1524 |
if "Group" in acc_reference_df.columns:
|
| 1525 |
-
|
| 1526 |
-
model_choice_clean_short = clean_column_name(model_choice_clean, max_length=20, front_characters=False)
|
| 1527 |
-
overall_file_name = clean_column_name(original_file_name, max_length=20)
|
| 1528 |
-
column_clean = clean_column_name(chosen_cols, max_length=20)
|
| 1529 |
|
| 1530 |
acc_reference_df_path = output_folder + overall_file_name + "_col_" + column_clean + "_all_final_reference_table_" + model_choice_clean_short + ".csv"
|
| 1531 |
acc_topic_summary_df_path = output_folder + overall_file_name + "_col_" + column_clean + "_all_final_unique_topics_" + model_choice_clean_short + ".csv"
|
| 1532 |
acc_reference_df_pivot_path = output_folder + overall_file_name + "_col_" + column_clean + "_all_final_reference_pivot_" + model_choice_clean_short + ".csv"
|
| 1533 |
acc_missing_df_path = output_folder + overall_file_name + "_col_" + column_clean + "_all_missing_df_" + model_choice_clean_short + ".csv"
|
| 1534 |
|
| 1535 |
-
acc_reference_df.to_csv(acc_reference_df_path, index=None)
|
| 1536 |
-
acc_topic_summary_df.to_csv(acc_topic_summary_df_path, index=None)
|
| 1537 |
-
acc_reference_df_pivot.to_csv(acc_reference_df_pivot_path, index=None)
|
| 1538 |
-
acc_missing_df.to_csv(acc_missing_df_path, index=None)
|
| 1539 |
|
| 1540 |
acc_log_files_output_paths.append(acc_missing_df_path)
|
| 1541 |
|
|
@@ -1740,6 +1759,7 @@ def all_in_one_pipeline(
|
|
| 1740 |
model_name_map_state: dict = model_name_map,
|
| 1741 |
usage_logs_location: str = "",
|
| 1742 |
existing_logged_content:list=list(),
|
|
|
|
| 1743 |
model: object = None,
|
| 1744 |
tokenizer: object = None,
|
| 1745 |
assistant_model: object = None,
|
|
@@ -1749,7 +1769,60 @@ def all_in_one_pipeline(
|
|
| 1749 |
"""
|
| 1750 |
Orchestrates the full All-in-one flow: extract → deduplicate → summarise → overall summary → Excel export.
|
| 1751 |
|
| 1752 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1753 |
"""
|
| 1754 |
|
| 1755 |
# Load local model if it's not already loaded
|
|
@@ -1830,7 +1903,8 @@ def all_in_one_pipeline(
|
|
| 1830 |
model=model,
|
| 1831 |
tokenizer=tokenizer,
|
| 1832 |
assistant_model=assistant_model,
|
| 1833 |
-
max_rows=max_rows
|
|
|
|
| 1834 |
)
|
| 1835 |
|
| 1836 |
total_input_tokens += out_input_tokens
|
|
|
|
| 15 |
from io import StringIO
|
| 16 |
GradioFileData = gr.FileData
|
| 17 |
|
| 18 |
+
from tools.prompts import initial_table_prompt, prompt2, prompt3, initial_table_system_prompt, add_existing_topics_system_prompt, add_existing_topics_prompt, force_existing_topics_prompt, allow_new_topics_prompt, force_single_topic_prompt, add_existing_topics_assistant_prefill, initial_table_assistant_prefill, structured_summary_prompt, default_response_reference_format, single_response_reference_format
|
| 19 |
from tools.helper_functions import read_file, put_columns_in_df, wrap_text, initial_clean, load_in_data_file, load_in_file, create_topic_summary_df_from_reference_table, convert_reference_table_to_pivot_table, get_basic_response_data, clean_column_name, load_in_previous_data_files, create_batch_file_path_details, move_overall_summary_output_files_to_front_page
|
| 20 |
from tools.llm_funcs import ResponseObject, construct_gemini_generative_model, call_llm_with_markdown_table_checks, create_missing_references_df, calculate_tokens_from_metadata, construct_azure_client, get_model, get_tokenizer, get_assistant_model
|
| 21 |
from tools.config import RUN_LOCAL_MODEL, AWS_REGION, MAX_COMMENT_CHARS, MAX_OUTPUT_VALIDATION_ATTEMPTS, LLM_MAX_NEW_TOKENS, TIMEOUT_WAIT, NUMBER_OF_RETRY_ATTEMPTS, MAX_TIME_FOR_LOOP, BATCH_SIZE_DEFAULT, DEDUPLICATION_THRESHOLD, model_name_map, OUTPUT_FOLDER, CHOSEN_LOCAL_MODEL_TYPE, LOCAL_REPO_ID, LOCAL_MODEL_FILE, LOCAL_MODEL_FOLDER, LLM_SEED, MAX_GROUPS, REASONING_SUFFIX, AZURE_INFERENCE_ENDPOINT, MAX_ROWS, MAXIMUM_ZERO_SHOT_TOPICS, MAX_SPACES_GPU_RUN_TIME, OUTPUT_DEBUG_FILES
|
|
|
|
| 352 |
topic_table_out_path = "topic_table_error.csv"
|
| 353 |
reference_table_out_path = "reference_table_error.csv"
|
| 354 |
topic_summary_df_out_path = "unique_topic_table_error.csv"
|
| 355 |
+
topic_with_response_df = pd.DataFrame(columns=["General topic", "Subtopic", "Sentiment", "Response References", "Summary"])
|
| 356 |
+
out_reference_df = pd.DataFrame(columns=["Response References", "General topic", "Subtopic", "Sentiment", "Summary", "Start row of group"])
|
| 357 |
+
out_topic_summary_df = pd.DataFrame(columns=["General topic", "Subtopic", "Sentiment"])
|
| 358 |
is_error = False # If there was an error in parsing, return boolean saying error
|
| 359 |
# Convert conversation to string and add to log outputs
|
| 360 |
whole_conversation_str = '\n'.join(whole_conversation)
|
|
|
|
| 385 |
topic_with_response_df, is_error = convert_response_text_to_dataframe(response_text)
|
| 386 |
except Exception as e:
|
| 387 |
print("Error in parsing markdown table from response text:", e)
|
| 388 |
+
|
| 389 |
return topic_table_out_path, reference_table_out_path, topic_summary_df_out_path, topic_with_response_df, out_reference_df, out_topic_summary_df, batch_file_path_details, is_error
|
| 390 |
|
| 391 |
# Rename columns to ensure consistent use of data frames later in code
|
|
|
|
| 421 |
for index, row in topic_with_response_df.iterrows():
|
| 422 |
references = re.findall(r'\d+', str(row.iloc[3])) if pd.notna(row.iloc[3]) else []
|
| 423 |
# If no numbers found in the Response References column, check the Summary column in case reference numbers were put there by mistake
|
| 424 |
+
##if not references:
|
| 425 |
+
# references = re.findall(r'\d+', str(row.iloc[4])) if pd.notna(row.iloc[4]) else []
|
| 426 |
+
# If batch size is 1, references will always be 1
|
| 427 |
+
if batch_size_number == 1:
|
| 428 |
+
references = "1"
|
| 429 |
|
| 430 |
# Filter out references that are outside the valid range
|
| 431 |
if references:
|
|
|
|
| 699 |
assistant_model:object=list(),
|
| 700 |
max_rows:int=max_rows,
|
| 701 |
original_full_file_name:str="",
|
| 702 |
+
add_existing_topics_summary_format:str="",
|
| 703 |
progress=Progress(track_tqdm=False)):
|
| 704 |
|
| 705 |
'''
|
|
|
|
| 754 |
- assistant_model: Assistant model object for local inference.
|
| 755 |
- max_rows: The maximum number of rows to process.
|
| 756 |
- original_full_file_name: The original full file name.
|
| 757 |
+
- add_existing_topics_summary_format: Initial instructions to guide the format for the initial summary of the topics.
|
| 758 |
- progress (Progress): A progress tracker.
|
| 759 |
|
| 760 |
'''
|
|
|
|
| 869 |
# Call the function to prepare the input table
|
| 870 |
simplified_csv_table_path, normalised_simple_markdown_table, start_row, end_row, batch_basic_response_df = data_file_to_markdown_table(file_data, file_name, chosen_cols, latest_batch_completed, batch_size)
|
| 871 |
|
| 872 |
+
if batch_basic_response_df.shape[0] == 1: response_reference_format = single_response_reference_format
|
| 873 |
+
else: response_reference_format = default_response_reference_format
|
| 874 |
+
|
| 875 |
# Conversation history
|
| 876 |
conversation_history = list()
|
| 877 |
|
|
|
|
| 960 |
# Format the summary prompt with the response table and topics
|
| 961 |
if produce_structures_summary_radio != "Yes":
|
| 962 |
formatted_summary_prompt = add_existing_topics_prompt.format(response_table=normalised_simple_markdown_table,
|
| 963 |
+
topics=unique_topics_markdown,
|
| 964 |
+
topic_assignment=topic_assignment_prompt,
|
| 965 |
+
force_single_topic=force_single_topic_prompt,
|
| 966 |
+
sentiment_choices=sentiment_prompt,
|
| 967 |
+
response_reference_format=response_reference_format,
|
| 968 |
+
add_existing_topics_summary_format=add_existing_topics_summary_format)
|
| 969 |
else:
|
| 970 |
formatted_summary_prompt = structured_summary_prompt.format(response_table=normalised_simple_markdown_table,
|
| 971 |
+
topics=unique_topics_markdown)
|
| 972 |
|
| 973 |
full_prompt = formatted_system_prompt + "\n" + formatted_summary_prompt
|
| 974 |
|
|
|
|
| 1010 |
|
| 1011 |
## Reference table mapping response numbers to topics
|
| 1012 |
if output_debug_files == "True":
|
| 1013 |
+
new_reference_df.to_csv(reference_table_out_path, index=None, encoding='utf-8-sig')
|
| 1014 |
out_file_paths.append(reference_table_out_path)
|
| 1015 |
|
| 1016 |
## Unique topic list
|
|
|
|
| 1019 |
new_topic_summary_df["Group"] = group_name
|
| 1020 |
|
| 1021 |
if output_debug_files == "True":
|
| 1022 |
+
new_topic_summary_df.to_csv(topic_summary_df_out_path, index=None, encoding='utf-8-sig')
|
| 1023 |
out_file_paths.append(topic_summary_df_out_path)
|
| 1024 |
|
| 1025 |
# Outputs for markdown table output
|
|
|
|
| 1052 |
|
| 1053 |
# Format the summary prompt with the response table and topics
|
| 1054 |
if produce_structures_summary_radio != "Yes":
|
| 1055 |
+
formatted_initial_table_prompt = initial_table_prompt.format(response_table=normalised_simple_markdown_table, sentiment_choices=sentiment_prompt,
|
| 1056 |
+
response_reference_format=response_reference_format, add_existing_topics_summary_format=add_existing_topics_summary_format)
|
| 1057 |
else:
|
| 1058 |
unique_topics_markdown="No suggested headings for this summary"
|
| 1059 |
formatted_initial_table_prompt = structured_summary_prompt.format(response_table=normalised_simple_markdown_table, topics=unique_topics_markdown)
|
|
|
|
| 1090 |
if output_debug_files == "True":
|
| 1091 |
|
| 1092 |
# Output reference table
|
| 1093 |
+
reference_df.to_csv(reference_table_out_path, index=None, encoding='utf-8-sig')
|
| 1094 |
out_file_paths.append(reference_table_out_path)
|
| 1095 |
|
| 1096 |
## Unique topic list
|
|
|
|
| 1100 |
new_topic_summary_df["Group"] = group_name
|
| 1101 |
|
| 1102 |
if output_debug_files == "True":
|
| 1103 |
+
new_topic_summary_df.to_csv(topic_summary_df_out_path, index=None, encoding='utf-8-sig')
|
| 1104 |
out_file_paths.append(topic_summary_df_out_path)
|
| 1105 |
|
| 1106 |
whole_conversation_metadata.append(whole_conversation_metadata_str)
|
|
|
|
| 1174 |
basic_response_data_out_path = output_folder + file_path_details + "_simplified_data_file_" + model_choice_clean_short + "_temp_" + str(temperature) + ".csv"
|
| 1175 |
|
| 1176 |
## Reference table mapping response numbers to topics
|
| 1177 |
+
existing_reference_df.to_csv(reference_table_out_path, index=None, encoding='utf-8-sig')
|
| 1178 |
out_file_paths.append(reference_table_out_path)
|
| 1179 |
join_file_paths.append(reference_table_out_path)
|
| 1180 |
|
|
|
|
| 1264 |
azure_api_key_textbox:str="",
|
| 1265 |
output_folder: str = OUTPUT_FOLDER,
|
| 1266 |
existing_logged_content:list=list(),
|
| 1267 |
+
add_existing_topics_summary_format:str="",
|
| 1268 |
force_single_topic_prompt: str = force_single_topic_prompt,
|
| 1269 |
max_tokens: int = max_tokens,
|
| 1270 |
model_name_map: dict = model_name_map,
|
|
|
|
| 1319 |
:param output_folder: The folder where output files will be saved.
|
| 1320 |
:param existing_logged_content: A list of existing logged content.
|
| 1321 |
:param force_single_topic_prompt: Prompt for forcing a single topic.
|
| 1322 |
+
:param add_existing_topics_summary_format: Initial instructions to guide the format for the initial summary of the topics.
|
| 1323 |
:param max_tokens: Maximum tokens for LLM generation.
|
| 1324 |
:param model_name_map: Dictionary mapping model names to their properties.
|
| 1325 |
:param max_time_for_loop: Maximum time allowed for the processing loop.
|
|
|
|
| 1328 |
:param model: Model object for local inference.
|
| 1329 |
:param tokenizer: Tokenizer object for local inference.
|
| 1330 |
:param assistant_model: Assistant model object for local inference.
|
| 1331 |
+
:param max_rows: The maximum number of rows to process.
|
| 1332 |
:param progress: Gradio Progress object for tracking progress.
|
| 1333 |
:return: A tuple containing consolidated results, mimicking the return structure of `extract_topics`.
|
| 1334 |
"""
|
|
|
|
| 1504 |
max_rows=max_rows,
|
| 1505 |
existing_logged_content=all_logged_content,
|
| 1506 |
original_full_file_name=original_file_name,
|
| 1507 |
+
add_existing_topics_summary_format=add_existing_topics_summary_format,
|
| 1508 |
progress=progress
|
| 1509 |
)
|
| 1510 |
|
|
|
|
| 1538 |
# For now, it will continue
|
| 1539 |
continue
|
| 1540 |
|
| 1541 |
+
overall_file_name = clean_column_name(original_file_name, max_length=20)
|
| 1542 |
+
model_choice_clean = model_name_map[model_choice]["short_name"]
|
| 1543 |
+
model_choice_clean_short = clean_column_name(model_choice_clean, max_length=20, front_characters=False)
|
| 1544 |
+
column_clean = clean_column_name(chosen_cols, max_length=20)
|
| 1545 |
+
|
| 1546 |
if "Group" in acc_reference_df.columns:
|
| 1547 |
+
|
| 1548 |
|
| 1549 |
acc_reference_df_path = output_folder + overall_file_name + "_col_" + column_clean + "_all_final_reference_table_" + model_choice_clean_short + ".csv"
|
| 1550 |
acc_topic_summary_df_path = output_folder + overall_file_name + "_col_" + column_clean + "_all_final_unique_topics_" + model_choice_clean_short + ".csv"
|
| 1551 |
acc_reference_df_pivot_path = output_folder + overall_file_name + "_col_" + column_clean + "_all_final_reference_pivot_" + model_choice_clean_short + ".csv"
|
| 1552 |
acc_missing_df_path = output_folder + overall_file_name + "_col_" + column_clean + "_all_missing_df_" + model_choice_clean_short + ".csv"
|
| 1553 |
|
| 1554 |
+
acc_reference_df.to_csv(acc_reference_df_path, index=None, encoding='utf-8-sig')
|
| 1555 |
+
acc_topic_summary_df.to_csv(acc_topic_summary_df_path, index=None, encoding='utf-8-sig')
|
| 1556 |
+
acc_reference_df_pivot.to_csv(acc_reference_df_pivot_path, index=None, encoding='utf-8-sig')
|
| 1557 |
+
acc_missing_df.to_csv(acc_missing_df_path, index=None, encoding='utf-8-sig')
|
| 1558 |
|
| 1559 |
acc_log_files_output_paths.append(acc_missing_df_path)
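Note: the CSV writes in this commit switch to encoding='utf-8-sig' so that Excel detects UTF-8 via the byte-order mark and renders accented characters in responses correctly. A minimal, self-contained sketch of the behaviour (file name illustrative):

import pandas as pd

df = pd.DataFrame({"Topic": ["Café access"], "Response References": [1]})
# utf-8-sig prepends a BOM, which Excel uses to pick UTF-8 instead of a legacy code page
df.to_csv("example_topics.csv", index=None, encoding="utf-8-sig")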
|
| 1560 |
|
|
|
|
| 1759 |
model_name_map_state: dict = model_name_map,
|
| 1760 |
usage_logs_location: str = "",
|
| 1761 |
existing_logged_content:list=list(),
|
| 1762 |
+
add_existing_topics_summary_format:str="",
|
| 1763 |
model: object = None,
|
| 1764 |
tokenizer: object = None,
|
| 1765 |
assistant_model: object = None,
|
|
|
|
| 1769 |
"""
|
| 1770 |
Orchestrates the full All-in-one flow: extract → deduplicate → summarise → overall summary → Excel export.
|
| 1771 |
|
| 1772 |
+
Args:
|
| 1773 |
+
grouping_col (str): The column used for grouping data.
|
| 1774 |
+
in_data_files (List[str]): List of input data file paths.
|
| 1775 |
+
file_data (pd.DataFrame): The input data as a pandas DataFrame.
|
| 1776 |
+
existing_topics_table (pd.DataFrame): DataFrame of existing topics.
|
| 1777 |
+
existing_reference_df (pd.DataFrame): DataFrame of existing reference data.
|
| 1778 |
+
existing_topic_summary_df (pd.DataFrame): DataFrame of existing topic summaries.
|
| 1779 |
+
unique_table_df_display_table_markdown (str): Markdown string for displaying unique topics.
|
| 1780 |
+
original_file_name (str): The original name of the input file.
|
| 1781 |
+
total_number_of_batches (int): Total number of batches for processing.
|
| 1782 |
+
in_api_key (str): API key for the LLM.
|
| 1783 |
+
temperature (float): Temperature setting for the LLM.
|
| 1784 |
+
chosen_cols (List[str]): List of columns chosen for analysis.
|
| 1785 |
+
model_choice (str): The chosen LLM model.
|
| 1786 |
+
candidate_topics (GradioFileData): Gradio file data for candidate topics.
|
| 1787 |
+
first_loop_state (bool): State indicating if it's the first loop.
|
| 1788 |
+
conversation_metadata_text (str): Text containing conversation metadata.
|
| 1789 |
+
latest_batch_completed (int): The latest batch number completed.
|
| 1790 |
+
time_taken_so_far (float): Cumulative time taken so far.
|
| 1791 |
+
initial_table_prompt_text (str): Initial prompt text for table generation.
|
| 1792 |
+
initial_table_system_prompt_text (str): Initial system prompt text for table generation.
|
| 1793 |
+
add_existing_topics_system_prompt_text (str): System prompt for adding existing topics.
|
| 1794 |
+
add_existing_topics_prompt_text (str): Prompt for adding existing topics.
|
| 1795 |
+
number_of_prompts_used (int): Number of prompts used in sequence.
|
| 1796 |
+
batch_size (int): Size of each processing batch.
|
| 1797 |
+
context_text (str): Additional context for the LLM.
|
| 1798 |
+
sentiment_choice (str): Choice for sentiment analysis (e.g., "Yes", "No").
|
| 1799 |
+
force_zero_shot_choice (str): Choice to force zero-shot prompting.
|
| 1800 |
+
in_excel_sheets (List[str]): List of sheet names in the input Excel file.
|
| 1801 |
+
force_single_topic_choice (str): Choice to force single topic extraction.
|
| 1802 |
+
produce_structures_summary_choice (str): Choice to produce structured summaries.
|
| 1803 |
+
aws_access_key_text (str): AWS access key.
|
| 1804 |
+
aws_secret_key_text (str): AWS secret key.
|
| 1805 |
+
hf_api_key_text (str): Hugging Face API key.
|
| 1806 |
+
azure_api_key_text (str): Azure API key.
|
| 1807 |
+
output_folder (str, optional): Folder to save output files. Defaults to OUTPUT_FOLDER.
|
| 1808 |
+
merge_sentiment (str, optional): Whether to merge sentiment. Defaults to "No".
|
| 1809 |
+
merge_general_topics (str, optional): Whether to merge general topics. Defaults to "Yes".
|
| 1810 |
+
score_threshold (int, optional): Score threshold for topic matching. Defaults to 90.
|
| 1811 |
+
summarise_format (str, optional): Format for summarization. Defaults to "".
|
| 1812 |
+
random_seed (int, optional): Random seed for reproducibility. Defaults to 42.
|
| 1813 |
+
log_files_output_list_state (List[str], optional): List of log file paths. Defaults to list().
|
| 1814 |
+
model_name_map_state (dict, optional): Mapping of model names. Defaults to model_name_map.
|
| 1815 |
+
usage_logs_location (str, optional): Location for usage logs. Defaults to "".
|
| 1816 |
+
existing_logged_content (list, optional): Existing logged content. Defaults to list().
|
| 1817 |
+
add_existing_topics_summary_format (str, optional): Summary format for adding existing topics. Defaults to "".
|
| 1818 |
+
model (object, optional): Loaded local model object. Defaults to None.
|
| 1819 |
+
tokenizer (object, optional): Loaded local tokenizer object. Defaults to None.
|
| 1820 |
+
assistant_model (object, optional): Loaded local assistant model object. Defaults to None.
|
| 1821 |
+
max_rows (int, optional): Maximum number of rows to process. Defaults to max_rows.
|
| 1822 |
+
progress (Progress, optional): Gradio Progress object for tracking. Defaults to Progress(track_tqdm=True).
|
| 1823 |
+
|
| 1824 |
+
Returns:
|
| 1825 |
+
A tuple matching the UI components updated during the original chained flow.
|
| 1826 |
"""
|
| 1827 |
|
| 1828 |
# Load local model if it's not already loaded
|
|
|
|
| 1903 |
model=model,
|
| 1904 |
tokenizer=tokenizer,
|
| 1905 |
assistant_model=assistant_model,
|
| 1906 |
+
max_rows=max_rows,
|
| 1907 |
+
add_existing_topics_summary_format=add_existing_topics_summary_format
|
| 1908 |
)
|
| 1909 |
|
| 1910 |
total_input_tokens += out_input_tokens
|
tools/llm_funcs.py
CHANGED
|
@@ -4,14 +4,11 @@ import re
|
|
| 4 |
import time
|
| 5 |
import boto3
|
| 6 |
import pandas as pd
|
| 7 |
-
import json
|
| 8 |
-
import spaces
|
| 9 |
from tqdm import tqdm
|
| 10 |
from huggingface_hub import hf_hub_download
|
| 11 |
from typing import List, Tuple, TypeVar
|
| 12 |
from google import genai as ai
|
| 13 |
from google.genai import types
|
| 14 |
-
import gradio as gr
|
| 15 |
from gradio import Progress
|
| 16 |
|
| 17 |
from azure.ai.inference import ChatCompletionsClient
|
|
@@ -26,15 +23,12 @@ _model = None
|
|
| 26 |
_tokenizer = None
|
| 27 |
_assistant_model = None
|
| 28 |
|
| 29 |
-
from tools.config import
|
| 30 |
-
from tools.prompts import initial_table_assistant_prefill
|
| 31 |
from tools.helper_functions import _get_env_list
|
| 32 |
|
| 33 |
if SPECULATIVE_DECODING == "True": SPECULATIVE_DECODING = True
|
| 34 |
else: SPECULATIVE_DECODING = False
|
| 35 |
|
| 36 |
-
if USE_SPECULATIVE_DECODING == "True": USE_SPECULATIVE_DECODING = True
|
| 37 |
-
else: USE_SPECULATIVE_DECODING = False
|
| 38 |
|
| 39 |
if isinstance(NUM_PRED_TOKENS, str): NUM_PRED_TOKENS = int(NUM_PRED_TOKENS)
|
| 40 |
if isinstance(LLM_MAX_GPU_LAYERS, str): LLM_MAX_GPU_LAYERS = int(LLM_MAX_GPU_LAYERS)
|
|
@@ -186,6 +180,7 @@ def load_model(local_model_type:str=CHOSEN_LOCAL_MODEL_TYPE,
|
|
| 186 |
compile_mode=COMPILE_MODE,
|
| 187 |
model_dtype=MODEL_DTYPE,
|
| 188 |
hf_token=HF_TOKEN,
|
|
|
|
| 189 |
model=None,
|
| 190 |
tokenizer=None,
|
| 191 |
assistant_model=None):
|
|
@@ -205,6 +200,7 @@ def load_model(local_model_type:str=CHOSEN_LOCAL_MODEL_TYPE,
|
|
| 205 |
compile_mode (str): The compilation mode to use for the model.
|
| 206 |
model_dtype (str): The data type to use for the model.
|
| 207 |
hf_token (str): The Hugging Face token to use for the model.
|
|
|
|
| 208 |
model (Llama/transformers model): The model to load.
|
| 209 |
tokenizer (list/transformers tokenizer): The tokenizer to load.
|
| 210 |
assistant_model (transformers model): The assistant model for speculative decoding.
|
|
@@ -212,7 +208,7 @@ def load_model(local_model_type:str=CHOSEN_LOCAL_MODEL_TYPE,
|
|
| 212 |
tuple: A tuple containing:
|
| 213 |
- model (Llama/transformers model): The loaded Llama.cpp/transformers model instance.
|
| 214 |
- tokenizer (list/transformers tokenizer): An empty list (tokenizer is not used with Llama.cpp directly in this setup), or a transformers tokenizer.
|
| 215 |
-
- assistant_model (transformers model): The assistant model for speculative decoding (if
|
| 216 |
'''
|
| 217 |
|
| 218 |
if model:
|
|
@@ -263,9 +259,9 @@ def load_model(local_model_type:str=CHOSEN_LOCAL_MODEL_TYPE,
|
|
| 263 |
try:
|
| 264 |
print("GPU load variables:" , vars(gpu_config))
|
| 265 |
if speculative_decoding:
|
| 266 |
-
model = Llama(model_path=model_path, type_k=
|
| 267 |
else:
|
| 268 |
-
model = Llama(model_path=model_path, type_k=
|
| 269 |
|
| 270 |
except Exception as e:
|
| 271 |
print("GPU load failed due to:", e, "Loading model in CPU mode")
|
|
@@ -397,7 +393,7 @@ def load_model(local_model_type:str=CHOSEN_LOCAL_MODEL_TYPE,
|
|
| 397 |
print("GPU layers assigned to cuda:", gpu_layers)
|
| 398 |
|
| 399 |
# Load assistant model for speculative decoding if enabled
|
| 400 |
-
if
|
| 401 |
print("Loading assistant model for speculative decoding:", ASSISTANT_MODEL)
|
| 402 |
try:
|
| 403 |
from transformers import AutoModelForCausalLM
|
|
@@ -764,7 +760,7 @@ def call_aws_claude(prompt: str, system_prompt: str, temperature: float, max_tok
|
|
| 764 |
|
| 765 |
return response
|
| 766 |
|
| 767 |
-
def call_transformers_model(prompt: str, system_prompt: str, gen_config: LlamaCPPGenerationConfig, model=None, tokenizer=None, assistant_model=None, progress=Progress(track_tqdm=False)):
|
| 768 |
"""
|
| 769 |
This function sends a request to a transformers model (through Unsloth) with the given prompt, system prompt, and generation configuration.
|
| 770 |
"""
|
|
@@ -774,7 +770,7 @@ def call_transformers_model(prompt: str, system_prompt: str, gen_config: LlamaCP
|
|
| 774 |
model = get_model()
|
| 775 |
if tokenizer is None:
|
| 776 |
tokenizer = get_tokenizer()
|
| 777 |
-
if assistant_model is None and
|
| 778 |
assistant_model = get_assistant_model()
|
| 779 |
|
| 780 |
if model is None or tokenizer is None:
|
|
@@ -784,10 +780,17 @@ def call_transformers_model(prompt: str, system_prompt: str, gen_config: LlamaCP
|
|
| 784 |
def wrap_text_message(text):
|
| 785 |
return [{"type": "text", "text": text}]
|
| 786 |
|
| 787 |
-
|
| 788 |
-
|
| 789 |
-
|
| 790 |
-
|
|
|
|
|
| 791 |
#print("Conversation:", conversation)
|
| 792 |
#import pprint
|
| 793 |
#pprint.pprint(conversation)
|
|
@@ -812,7 +815,7 @@ def call_transformers_model(prompt: str, system_prompt: str, gen_config: LlamaCP
|
|
| 812 |
|
| 813 |
# Map LlamaCPP parameters to transformers parameters
|
| 814 |
generation_kwargs = {
|
| 815 |
-
'
|
| 816 |
'temperature': gen_config.temperature,
|
| 817 |
'top_p': gen_config.top_p,
|
| 818 |
'top_k': gen_config.top_k,
|
|
@@ -834,7 +837,7 @@ def call_transformers_model(prompt: str, system_prompt: str, gen_config: LlamaCP
|
|
| 834 |
start_time = time.time()
|
| 835 |
|
| 836 |
# Use speculative decoding if assistant model is available
|
| 837 |
-
if
|
| 838 |
print("Using speculative decoding with assistant model")
|
| 839 |
outputs = model.generate(
|
| 840 |
input_ids,
|
|
@@ -853,7 +856,7 @@ def call_transformers_model(prompt: str, system_prompt: str, gen_config: LlamaCP
|
|
| 853 |
end_time = time.time()
|
| 854 |
|
| 855 |
# --- Decode and Display Results ---
|
| 856 |
-
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
|
| 857 |
# To get only the model's reply, we can decode just the newly generated tokens
|
| 858 |
new_tokens = outputs[0][input_ids.shape[-1]:]
|
| 859 |
assistant_reply = tokenizer.decode(new_tokens, skip_special_tokens=True)
|
|
@@ -883,6 +886,7 @@ def send_request(prompt: str, conversation_history: List[dict], google_client: a
|
|
| 883 |
full_prompt = "Conversation history:\n"
|
| 884 |
num_transformer_input_tokens = 0
|
| 885 |
num_transformer_generated_tokens = 0
|
|
|
|
| 886 |
|
| 887 |
for entry in conversation_history:
|
| 888 |
role = entry['role'].capitalize() # Assuming the history is stored with 'role' and 'parts'
|
|
@@ -915,7 +919,7 @@ def send_request(prompt: str, conversation_history: List[dict], google_client: a
|
|
| 915 |
time.sleep(timeout_wait)
|
| 916 |
|
| 917 |
if i == number_of_api_retry_attempts:
|
| 918 |
-
return ResponseObject(text="", usage_metadata={'RequestId':"FAILED"}), conversation_history
|
| 919 |
|
| 920 |
elif "AWS" in model_source:
|
| 921 |
for i in progress_bar:
|
|
@@ -931,7 +935,7 @@ def send_request(prompt: str, conversation_history: List[dict], google_client: a
|
|
| 931 |
time.sleep(timeout_wait)
|
| 932 |
|
| 933 |
if i == number_of_api_retry_attempts:
|
| 934 |
-
return ResponseObject(text="", usage_metadata={'RequestId':"FAILED"}), conversation_history
|
| 935 |
elif "Azure" in model_source:
|
| 936 |
for i in progress_bar:
|
| 937 |
try:
|
|
@@ -960,7 +964,7 @@ def send_request(prompt: str, conversation_history: List[dict], google_client: a
|
|
| 960 |
print("Call to Azure model failed:", e, " Waiting for ", str(timeout_wait), "seconds and trying again.")
|
| 961 |
time.sleep(timeout_wait)
|
| 962 |
if i == number_of_api_retry_attempts:
|
| 963 |
-
return ResponseObject(text="", usage_metadata={'RequestId':"FAILED"}), conversation_history
|
| 964 |
elif "Local" in model_source:
|
| 965 |
# This is the local model
|
| 966 |
for i in progress_bar:
|
|
@@ -986,10 +990,10 @@ def send_request(prompt: str, conversation_history: List[dict], google_client: a
|
|
| 986 |
time.sleep(timeout_wait)
|
| 987 |
|
| 988 |
if i == number_of_api_retry_attempts:
|
| 989 |
-
return ResponseObject(text="", usage_metadata={'RequestId':"FAILED"}), conversation_history
|
| 990 |
else:
|
| 991 |
print("Model source not recognised")
|
| 992 |
-
return ResponseObject(text="", usage_metadata={'RequestId':"FAILED"}), conversation_history
|
| 993 |
|
| 994 |
# Update the conversation history with the new prompt and response
|
| 995 |
conversation_history.append({'role': 'user', 'parts': [prompt]})
|
|
@@ -998,19 +1002,17 @@ def send_request(prompt: str, conversation_history: List[dict], google_client: a
|
|
| 998 |
if isinstance(response, ResponseObject):
|
| 999 |
response_text = response.text
|
| 1000 |
elif 'choices' in response: # LLama.cpp model response
|
| 1001 |
-
if "gpt-oss" in model_choice:
|
| 1002 |
-
|
| 1003 |
-
else:
|
| 1004 |
-
response_text = response['choices'][0]['message']['content']
|
| 1005 |
-
response_text = response_text.strip()
|
| 1006 |
elif model_source == "Gemini":
|
| 1007 |
response_text = response.text
|
| 1008 |
-
response_text = response_text.strip()
|
| 1009 |
else: # Assume transformers model response
|
| 1010 |
-
if "gpt-oss" in model_choice:
|
| 1011 |
-
|
| 1012 |
-
|
| 1013 |
-
|
|
|
|
|
|
|
| 1014 |
|
| 1015 |
conversation_history.append({'role': 'assistant', 'parts': [response_text]})
|
| 1016 |
|
|
|
|
| 4 |
import time
|
| 5 |
import boto3
|
| 6 |
import pandas as pd
|
|
|
|
|
|
|
| 7 |
from tqdm import tqdm
|
| 8 |
from huggingface_hub import hf_hub_download
|
| 9 |
from typing import List, Tuple, TypeVar
|
| 10 |
from google import genai as ai
|
| 11 |
from google.genai import types
|
|
|
|
| 12 |
from gradio import Progress
|
| 13 |
|
| 14 |
from azure.ai.inference import ChatCompletionsClient
|
|
|
|
| 23 |
_tokenizer = None
|
| 24 |
_assistant_model = None
|
| 25 |
|
| 26 |
+
from tools.config import LLM_TEMPERATURE, LLM_TOP_K, LLM_MIN_P, LLM_TOP_P, LLM_REPETITION_PENALTY, LLM_LAST_N_TOKENS, LLM_MAX_NEW_TOKENS, LLM_SEED, LLM_RESET, LLM_STREAM, LLM_THREADS, LLM_BATCH_SIZE, LLM_CONTEXT_LENGTH, LLM_SAMPLE, TIMEOUT_WAIT, NUMBER_OF_RETRY_ATTEMPTS, MAX_TIME_FOR_LOOP, BATCH_SIZE_DEFAULT, DEDUPLICATION_THRESHOLD, MAX_COMMENT_CHARS, CHOSEN_LOCAL_MODEL_TYPE, LOCAL_REPO_ID, LOCAL_MODEL_FILE, LOCAL_MODEL_FOLDER, HF_TOKEN, LLM_SEED, LLM_MAX_GPU_LAYERS, SPECULATIVE_DECODING, NUM_PRED_TOKENS, USE_LLAMA_CPP, COMPILE_MODE, MODEL_DTYPE, USE_BITSANDBYTES, COMPILE_TRANSFORMERS, INT8_WITH_OFFLOAD_TO_CPU, LOAD_LOCAL_MODEL_AT_START, ASSISTANT_MODEL, LLM_STOP_STRINGS, MULTIMODAL_PROMPT_FORMAT, KV_QUANT_LEVEL
|
|
|
|
| 27 |
from tools.helper_functions import _get_env_list
|
| 28 |
|
| 29 |
if SPECULATIVE_DECODING == "True": SPECULATIVE_DECODING = True
|
| 30 |
else: SPECULATIVE_DECODING = False
|
| 31 |
|
|
|
|
|
|
|
| 32 |
|
| 33 |
if isinstance(NUM_PRED_TOKENS, str): NUM_PRED_TOKENS = int(NUM_PRED_TOKENS)
|
| 34 |
if isinstance(LLM_MAX_GPU_LAYERS, str): LLM_MAX_GPU_LAYERS = int(LLM_MAX_GPU_LAYERS)
|
|
|
|
| 180 |
compile_mode=COMPILE_MODE,
|
| 181 |
model_dtype=MODEL_DTYPE,
|
| 182 |
hf_token=HF_TOKEN,
|
| 183 |
+
speculative_decoding=speculative_decoding,
|
| 184 |
model=None,
|
| 185 |
tokenizer=None,
|
| 186 |
assistant_model=None):
|
|
|
|
| 200 |
compile_mode (str): The compilation mode to use for the model.
|
| 201 |
model_dtype (str): The data type to use for the model.
|
| 202 |
hf_token (str): The Hugging Face token to use for the model.
|
| 203 |
+
speculative_decoding (bool): Whether to use speculative decoding.
|
| 204 |
model (Llama/transformers model): The model to load.
|
| 205 |
tokenizer (list/transformers tokenizer): The tokenizer to load.
|
| 206 |
assistant_model (transformers model): The assistant model for speculative decoding.
|
|
|
|
| 208 |
tuple: A tuple containing:
|
| 209 |
- model (Llama/transformers model): The loaded Llama.cpp/transformers model instance.
|
| 210 |
- tokenizer (list/transformers tokenizer): An empty list (tokenizer is not used with Llama.cpp directly in this setup), or a transformers tokenizer.
|
| 211 |
+
- assistant_model (transformers model): The assistant model for speculative decoding (if speculative_decoding is True).
|
| 212 |
'''
|
| 213 |
|
| 214 |
if model:
|
|
|
|
| 259 |
try:
|
| 260 |
print("GPU load variables:" , vars(gpu_config))
|
| 261 |
if speculative_decoding:
|
| 262 |
+
model = Llama(model_path=model_path, type_k=KV_QUANT_LEVEL, type_v=KV_QUANT_LEVEL, flash_attn=True, draft_model=LlamaPromptLookupDecoding(num_pred_tokens=NUM_PRED_TOKENS), **vars(gpu_config))
|
| 263 |
else:
|
| 264 |
+
model = Llama(model_path=model_path, type_k=KV_QUANT_LEVEL, type_v=KV_QUANT_LEVEL, flash_attn=True, **vars(gpu_config))
|
| 265 |
|
| 266 |
except Exception as e:
|
| 267 |
print("GPU load failed due to:", e, "Loading model in CPU mode")
|
|
|
|
| 393 |
print("GPU layers assigned to cuda:", gpu_layers)
|
| 394 |
|
| 395 |
# Load assistant model for speculative decoding if enabled
|
| 396 |
+
if speculative_decoding and USE_LLAMA_CPP == "False" and torch_device == "cuda":
|
| 397 |
print("Loading assistant model for speculative decoding:", ASSISTANT_MODEL)
|
| 398 |
try:
|
| 399 |
from transformers import AutoModelForCausalLM
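The assistant model feeds transformers-side speculative (assisted) generation, where a small draft model proposes tokens that the main model verifies. A hedged sketch of the mechanism with placeholder model names:

from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-4B")                                   # placeholder
main_model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen3-4B", device_map="cuda")        # placeholder
draft_model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen3-0.6B", device_map="cuda")     # placeholder draft

inputs = tokenizer("Summarise the responses in the table below.", return_tensors="pt").to("cuda")
# assistant_model triggers assisted generation; the main model still verifies every token
outputs = main_model.generate(**inputs, assistant_model=draft_model, max_new_tokens=64)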
|
|
|
|
| 760 |
|
| 761 |
return response
|
| 762 |
|
| 763 |
+
def call_transformers_model(prompt: str, system_prompt: str, gen_config: LlamaCPPGenerationConfig, model=None, tokenizer=None, assistant_model=None, speculative_decoding=speculative_decoding, progress=Progress(track_tqdm=False)):
|
| 764 |
"""
|
| 765 |
This function sends a request to a transformers model (through Unsloth) with the given prompt, system prompt, and generation configuration.
|
| 766 |
"""
|
|
|
|
| 770 |
model = get_model()
|
| 771 |
if tokenizer is None:
|
| 772 |
tokenizer = get_tokenizer()
|
| 773 |
+
if assistant_model is None and speculative_decoding:
|
| 774 |
assistant_model = get_assistant_model()
|
| 775 |
|
| 776 |
if model is None or tokenizer is None:
|
|
|
|
| 780 |
def wrap_text_message(text):
|
| 781 |
return [{"type": "text", "text": text}]
|
| 782 |
|
| 783 |
+
if MULTIMODAL_PROMPT_FORMAT == "True":
|
| 784 |
+
conversation = [
|
| 785 |
+
{"role": "system", "content": wrap_text_message(system_prompt)},
|
| 786 |
+
{"role": "user", "content": wrap_text_message(prompt)}
|
| 787 |
+
]
|
| 788 |
+
|
| 789 |
+
else:
|
| 790 |
+
conversation = [
|
| 791 |
+
{"role": "system", "content": system_prompt},
|
| 792 |
+
{"role": "user", "content": prompt}
|
| 793 |
+
]
|
| 794 |
#print("Conversation:", conversation)
|
| 795 |
#import pprint
|
| 796 |
#pprint.pprint(conversation)
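The wrapped {"type": "text", "text": ...} content form matches chat templates that expect multimodal-style message parts, while the plain-string form suits text-only templates. A hedged sketch of how either conversation is turned into model inputs (mirrors the tokenisation step that follows; variable names assumed):

input_ids = tokenizer.apply_chat_template(
    conversation,
    add_generation_prompt=True,  # append the assistant-turn marker so generation starts cleanly
    return_tensors="pt",
).to(model.device)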
|
|
|
|
| 815 |
|
| 816 |
# Map LlamaCPP parameters to transformers parameters
|
| 817 |
generation_kwargs = {
|
| 818 |
+
'max_new_tokens': gen_config.max_tokens,
|
| 819 |
'temperature': gen_config.temperature,
|
| 820 |
'top_p': gen_config.top_p,
|
| 821 |
'top_k': gen_config.top_k,
|
|
|
|
| 837 |
start_time = time.time()
|
| 838 |
|
| 839 |
# Use speculative decoding if assistant model is available
|
| 840 |
+
if speculative_decoding and assistant_model is not None:
|
| 841 |
print("Using speculative decoding with assistant model")
|
| 842 |
outputs = model.generate(
|
| 843 |
input_ids,
|
|
|
|
| 856 |
end_time = time.time()
|
| 857 |
|
| 858 |
# --- Decode and Display Results ---
|
| 859 |
+
#generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
|
| 860 |
# To get only the model's reply, we can decode just the newly generated tokens
|
| 861 |
new_tokens = outputs[0][input_ids.shape[-1]:]
|
| 862 |
assistant_reply = tokenizer.decode(new_tokens, skip_special_tokens=True)
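Because outputs[0] contains the prompt followed by the new tokens, slicing at input_ids.shape[-1] isolates the reply; the same lengths give the input and generated token counts used for usage reporting. A hedged sketch (names assumed):

num_input_tokens = int(input_ids.shape[-1])                           # prompt length in tokens
num_generated_tokens = int(outputs[0].shape[-1]) - num_input_tokens   # newly generated tokens only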
|
|
|
|
| 886 |
full_prompt = "Conversation history:\n"
|
| 887 |
num_transformer_input_tokens = 0
|
| 888 |
num_transformer_generated_tokens = 0
|
| 889 |
+
response_text = ""
|
| 890 |
|
| 891 |
for entry in conversation_history:
|
| 892 |
role = entry['role'].capitalize() # Assuming the history is stored with 'role' and 'parts'
|
|
|
|
| 919 |
time.sleep(timeout_wait)
|
| 920 |
|
| 921 |
if i == number_of_api_retry_attempts:
|
| 922 |
+
return ResponseObject(text="", usage_metadata={'RequestId':"FAILED"}), conversation_history, response_text, num_transformer_input_tokens, num_transformer_generated_tokens
|
| 923 |
|
| 924 |
elif "AWS" in model_source:
|
| 925 |
for i in progress_bar:
|
|
|
|
| 935 |
time.sleep(timeout_wait)
|
| 936 |
|
| 937 |
if i == number_of_api_retry_attempts:
|
| 938 |
+
return ResponseObject(text="", usage_metadata={'RequestId':"FAILED"}), conversation_history, response_text, num_transformer_input_tokens, num_transformer_generated_tokens
|
| 939 |
elif "Azure" in model_source:
|
| 940 |
for i in progress_bar:
|
| 941 |
try:
|
|
|
|
| 964 |
print("Call to Azure model failed:", e, " Waiting for ", str(timeout_wait), "seconds and trying again.")
|
| 965 |
time.sleep(timeout_wait)
|
| 966 |
if i == number_of_api_retry_attempts:
|
| 967 |
+
return ResponseObject(text="", usage_metadata={'RequestId':"FAILED"}), conversation_history, response_text, num_transformer_input_tokens, num_transformer_generated_tokens
|
| 968 |
elif "Local" in model_source:
|
| 969 |
# This is the local model
|
| 970 |
for i in progress_bar:
|
|
|
|
| 990 |
time.sleep(timeout_wait)
|
| 991 |
|
| 992 |
if i == number_of_api_retry_attempts:
|
| 993 |
+
return ResponseObject(text="", usage_metadata={'RequestId':"FAILED"}), conversation_history, response_text, num_transformer_input_tokens, num_transformer_generated_tokens
|
| 994 |
else:
|
| 995 |
print("Model source not recognised")
|
| 996 |
+
return ResponseObject(text="", usage_metadata={'RequestId':"FAILED"}), conversation_history, response_text, num_transformer_input_tokens, num_transformer_generated_tokens
|
| 997 |
|
| 998 |
# Update the conversation history with the new prompt and response
|
| 999 |
conversation_history.append({'role': 'user', 'parts': [prompt]})
|
|
|
|
| 1002 |
if isinstance(response, ResponseObject):
|
| 1003 |
response_text = response.text
|
| 1004 |
elif 'choices' in response: # LLama.cpp model response
|
| 1005 |
+
if "gpt-oss" in model_choice: response_text = response['choices'][0]['message']['content'].split('<|start|>assistant<|channel|>final<|message|>')[1]
|
| 1006 |
+
else: response_text = response['choices'][0]['message']['content']
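gpt-oss models emit Harmony channel markers (analysis, final, etc.), so the split above keeps only the final-channel text. A slightly more defensive hedged variant that falls back to the full output if the marker is missing:

final_marker = '<|start|>assistant<|channel|>final<|message|>'
raw_text = response['choices'][0]['message']['content']
response_text = raw_text.split(final_marker, 1)[1] if final_marker in raw_text else raw_text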
|
|
|
|
|
|
|
|
|
|
| 1007 |
elif model_source == "Gemini":
|
| 1008 |
response_text = response.text
|
|
|
|
| 1009 |
else: # Assume transformers model response
|
| 1010 |
+
if "gpt-oss" in model_choice: response_text = response.split('<|start|>assistant<|channel|>final<|message|>')[1]
|
| 1011 |
+
else: response_text = response
|
| 1012 |
+
|
| 1013 |
+
# Replace multiple spaces with single space
|
| 1014 |
+
response_text = re.sub(r' {2,}', ' ', response_text)
|
| 1015 |
+
response_text = response_text.strip()
|
| 1016 |
|
| 1017 |
conversation_history.append({'role': 'assistant', 'parts': [response_text]})
|
| 1018 |
|
tools/prompts.py
CHANGED
|
@@ -8,12 +8,16 @@ initial_table_system_prompt = system_prompt + markdown_additional_prompt
|
|
| 8 |
|
| 9 |
initial_table_assistant_prefill = "|"
|
| 10 |
|
|
|
|
| 11 |
initial_table_prompt = """Your task is to create one new markdown table based on open text responses in the reponse table below with the headings 'General topic', 'Subtopic', 'Sentiment', 'Response References', and 'Summary'.
|
| 12 |
In the first column identify general topics relevant to responses. Create as many general topics as you can.
|
| 13 |
In the second column list subtopics relevant to responses. Make the subtopics as specific as possible and make sure they cover every issue mentioned. The subtopic should never be blank or empty.
|
| 14 |
{sentiment_choices}.
|
| 15 |
-
In the fourth column
|
| 16 |
-
In the fifth column, write a
|
| 17 |
Do not add any other columns. Do not add any other text to your response.
|
| 18 |
|
| 19 |
Response table:
|
|
@@ -46,8 +50,8 @@ force_single_topic_prompt = """ Assign each response to one single topic only.""
|
|
| 46 |
add_existing_topics_prompt = """Your task is to create one new markdown table, assigning responses from the Response table below to topics.
|
| 47 |
{topic_assignment}{force_single_topic}
|
| 48 |
{sentiment_choices}.
|
| 49 |
-
In the fourth column
|
| 50 |
-
In the fifth column, write a
|
| 51 |
Do not add any other columns. Do not add any other text to your response.
|
| 52 |
|
| 53 |
Responses are shown in the following Response table:
|
|
|
|
| 8 |
|
| 9 |
initial_table_assistant_prefill = "|"
|
| 10 |
|
| 11 |
+
default_response_reference_format = "list each specific Response reference number that is relevant to the Subtopic, separated by commas. Do not write any other text in this column."
|
| 12 |
+
|
| 13 |
+
single_response_reference_format = "'Response References', write the number 1 alongside each Subtopic and no other text."
|
| 14 |
+
|
| 15 |
initial_table_prompt = """Your task is to create one new markdown table based on open text responses in the reponse table below with the headings 'General topic', 'Subtopic', 'Sentiment', 'Response References', and 'Summary'.
|
| 16 |
In the first column identify general topics relevant to responses. Create as many general topics as you can.
|
| 17 |
In the second column list subtopics relevant to responses. Make the subtopics as specific as possible and make sure they cover every issue mentioned. The subtopic should never be blank or empty.
|
| 18 |
{sentiment_choices}.
|
| 19 |
+
In the fourth column {response_reference_format}
|
| 20 |
+
In the fifth column, write a summary of the subtopic based on relevant responses - highlight specific issues that appear. {add_existing_topics_summary_format}
|
| 21 |
Do not add any other columns. Do not add any other text to your response.
|
| 22 |
|
| 23 |
Response table:
|
|
|
|
| 50 |
add_existing_topics_prompt = """Your task is to create one new markdown table, assigning responses from the Response table below to topics.
|
| 51 |
{topic_assignment}{force_single_topic}
|
| 52 |
{sentiment_choices}.
|
| 53 |
+
In the fourth column {response_reference_format}
|
| 54 |
+
In the fifth column, write a summary of the Subtopic based on relevant responses - highlight specific issues that appear. {add_existing_topics_summary_format}
|
| 55 |
Do not add any other columns. Do not add any other text to your response.
|
| 56 |
|
| 57 |
Responses are shown in the following Response table:
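For clarity, the new {response_reference_format} and {add_existing_topics_summary_format} placeholders are filled via str.format in tools/llm_api_call.py; a hedged, minimal illustration with made-up values:

from tools.prompts import initial_table_prompt, default_response_reference_format

example_prompt = initial_table_prompt.format(
    response_table="| Reference | Response |\n| 1 | Example answer |",                 # illustrative table
    sentiment_choices="In the third column, assign a sentiment to each Subtopic.",     # illustrative
    response_reference_format=default_response_reference_format,
    add_existing_topics_summary_format="Keep each summary to two sentences.",          # illustrative
)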
|
tools/verify_titles.py
CHANGED
|
@@ -492,17 +492,17 @@ def verify_titles(in_data_file,
|
|
| 492 |
|
| 493 |
# Write outputs to csv
|
| 494 |
## Topics with references
|
| 495 |
-
new_topic_df.to_csv(topic_table_out_path, index=None)
|
| 496 |
log_files_output_paths.append(topic_table_out_path)
|
| 497 |
|
| 498 |
## Reference table mapping response numbers to topics
|
| 499 |
-
new_reference_df.to_csv(reference_table_out_path, index=None)
|
| 500 |
out_file_paths.append(reference_table_out_path)
|
| 501 |
|
| 502 |
## Unique topic list
|
| 503 |
new_unique_topics_df = pd.concat([new_unique_topics_df, existing_unique_topics_df]) #.drop_duplicates('Subtopic')
|
| 504 |
|
| 505 |
-
new_unique_topics_df.to_csv(unique_topics_df_out_path, index=None)
|
| 506 |
out_file_paths.append(unique_topics_df_out_path)
|
| 507 |
|
| 508 |
# Outputs for markdown table output
|
|
@@ -536,7 +536,7 @@ def verify_titles(in_data_file,
|
|
| 536 |
|
| 537 |
formatted_initial_table_system_prompt = system_prompt.format(consultation_context=context_textbox, column_name=chosen_cols)
|
| 538 |
|
| 539 |
-
formatted_initial_table_prompt = initial_table_prompt.format(response_table=normalised_simple_markdown_table)
|
| 540 |
|
| 541 |
if prompt2: formatted_prompt2 = prompt2.format(response_table=normalised_simple_markdown_table)
|
| 542 |
else: formatted_prompt2 = prompt2
|
|
@@ -561,16 +561,16 @@ def verify_titles(in_data_file,
|
|
| 561 |
# If error in table parsing, leave function
|
| 562 |
if is_error == True: raise Exception("Error in output table parsing")
|
| 563 |
|
| 564 |
-
topic_table_df.to_csv(topic_table_out_path, index=None)
|
| 565 |
out_file_paths.append(topic_table_out_path)
|
| 566 |
|
| 567 |
-
reference_df.to_csv(reference_table_out_path, index=None)
|
| 568 |
out_file_paths.append(reference_table_out_path)
|
| 569 |
|
| 570 |
## Unique topic list
|
| 571 |
new_unique_topics_df = pd.concat([new_unique_topics_df, existing_unique_topics_df])
|
| 572 |
|
| 573 |
-
new_unique_topics_df.to_csv(unique_topics_df_out_path, index=None)
|
| 574 |
out_file_paths.append(unique_topics_df_out_path)
|
| 575 |
|
| 576 |
whole_conversation_metadata.append(whole_conversation_metadata_str)
|
|
@@ -672,14 +672,14 @@ def verify_titles(in_data_file,
|
|
| 672 |
basic_response_data_out_path = output_folder + file_path_details + "_simplified_data_file_" + model_choice_clean + "_temp_" + str(temperature) + ".csv"
|
| 673 |
|
| 674 |
## Reference table mapping response numbers to topics
|
| 675 |
-
existing_reference_df.to_csv(reference_table_out_path, index=None)
|
| 676 |
out_file_paths.append(reference_table_out_path)
|
| 677 |
|
| 678 |
# Create final unique topics table from reference table to ensure consistent numbers
|
| 679 |
final_out_unique_topics_df = existing_unique_topics_df #create_topic_summary_df_from_reference_table(existing_reference_df)
|
| 680 |
|
| 681 |
## Unique topic list
|
| 682 |
-
final_out_unique_topics_df.to_csv(unique_topics_df_out_path, index=None)
|
| 683 |
out_file_paths.append(unique_topics_df_out_path)
|
| 684 |
|
| 685 |
# Ensure that we are only returning the final results to outputs
|
|
@@ -696,7 +696,7 @@ def verify_titles(in_data_file,
|
|
| 696 |
basic_response_data = get_basic_response_data(file_data, chosen_cols, verify_titles=True)
|
| 697 |
|
| 698 |
# Save simplified file data to log outputs
|
| 699 |
-
pd.DataFrame(basic_response_data).to_csv(basic_response_data_out_path, index=None)
|
| 700 |
log_files_output_paths.append(basic_response_data_out_path)
|
| 701 |
|
| 702 |
# Step 1: Identify missing references
|
|
@@ -713,7 +713,7 @@ def verify_titles(in_data_file,
|
|
| 713 |
#print("missing_df:", missing_df)
|
| 714 |
|
| 715 |
missing_df_out_path = output_folder + file_path_details + "_missing_references_" + model_choice_clean + "_temp_" + str(temperature) + ".csv"
|
| 716 |
-
missing_df.to_csv(missing_df_out_path, index=None)
|
| 717 |
log_files_output_paths.append(missing_df_out_path)
|
| 718 |
|
| 719 |
out_file_paths = list(set(out_file_paths))
|
|
|
|
| 492 |
|
| 493 |
# Write outputs to csv
|
| 494 |
## Topics with references
|
| 495 |
+
new_topic_df.to_csv(topic_table_out_path, index=None, encoding='utf-8-sig')
|
| 496 |
log_files_output_paths.append(topic_table_out_path)
|
| 497 |
|
| 498 |
## Reference table mapping response numbers to topics
|
| 499 |
+
new_reference_df.to_csv(reference_table_out_path, index=None, encoding='utf-8-sig')
|
| 500 |
out_file_paths.append(reference_table_out_path)
|
| 501 |
|
| 502 |
## Unique topic list
|
| 503 |
new_unique_topics_df = pd.concat([new_unique_topics_df, existing_unique_topics_df]) #.drop_duplicates('Subtopic')
|
| 504 |
|
| 505 |
+
new_unique_topics_df.to_csv(unique_topics_df_out_path, index=None, encoding='utf-8-sig')
|
| 506 |
out_file_paths.append(unique_topics_df_out_path)
|
| 507 |
|
| 508 |
# Outputs for markdown table output
|
|
|
|
| 536 |
|
| 537 |
formatted_initial_table_system_prompt = system_prompt.format(consultation_context=context_textbox, column_name=chosen_cols)
|
| 538 |
|
| 539 |
+
formatted_initial_table_prompt = initial_table_prompt.format(response_table=normalised_simple_markdown_table, add_existing_topics_summary_format=add_existing_topics_summary_format)
|
| 540 |
|
| 541 |
if prompt2: formatted_prompt2 = prompt2.format(response_table=normalised_simple_markdown_table)
|
| 542 |
else: formatted_prompt2 = prompt2
|
|
|
|
| 561 |
# If error in table parsing, leave function
|
| 562 |
if is_error == True: raise Exception("Error in output table parsing")
|
| 563 |
|
| 564 |
+
topic_table_df.to_csv(topic_table_out_path, index=None, encoding='utf-8-sig')
|
| 565 |
out_file_paths.append(topic_table_out_path)
|
| 566 |
|
| 567 |
+
reference_df.to_csv(reference_table_out_path, index=None, encoding='utf-8-sig')
|
| 568 |
out_file_paths.append(reference_table_out_path)
|
| 569 |
|
| 570 |
## Unique topic list
|
| 571 |
new_unique_topics_df = pd.concat([new_unique_topics_df, existing_unique_topics_df])
|
| 572 |
|
| 573 |
+
new_unique_topics_df.to_csv(unique_topics_df_out_path, index=None, encoding='utf-8-sig')
|
| 574 |
out_file_paths.append(unique_topics_df_out_path)
|
| 575 |
|
| 576 |
whole_conversation_metadata.append(whole_conversation_metadata_str)
|
|
|
|
| 672 |
basic_response_data_out_path = output_folder + file_path_details + "_simplified_data_file_" + model_choice_clean + "_temp_" + str(temperature) + ".csv"
|
| 673 |
|
| 674 |
## Reference table mapping response numbers to topics
|
| 675 |
+
existing_reference_df.to_csv(reference_table_out_path, index=None, encoding='utf-8-sig')
|
| 676 |
out_file_paths.append(reference_table_out_path)
|
| 677 |
|
| 678 |
# Create final unique topics table from reference table to ensure consistent numbers
|
| 679 |
final_out_unique_topics_df = existing_unique_topics_df #create_topic_summary_df_from_reference_table(existing_reference_df)
|
| 680 |
|
| 681 |
## Unique topic list
|
| 682 |
+
final_out_unique_topics_df.to_csv(unique_topics_df_out_path, index=None, encoding='utf-8-sig')
|
| 683 |
out_file_paths.append(unique_topics_df_out_path)
|
| 684 |
|
| 685 |
# Ensure that we are only returning the final results to outputs
|
|
|
|
| 696 |
basic_response_data = get_basic_response_data(file_data, chosen_cols, verify_titles=True)
|
| 697 |
|
| 698 |
# Save simplified file data to log outputs
|
| 699 |
+
pd.DataFrame(basic_response_data).to_csv(basic_response_data_out_path, index=None, encoding='utf-8-sig')
|
| 700 |
log_files_output_paths.append(basic_response_data_out_path)
|
| 701 |
|
| 702 |
# Step 1: Identify missing references
|
|
|
|
| 713 |
#print("missing_df:", missing_df)
|
| 714 |
|
| 715 |
missing_df_out_path = output_folder + file_path_details + "_missing_references_" + model_choice_clean + "_temp_" + str(temperature) + ".csv"
|
| 716 |
+
missing_df.to_csv(missing_df_out_path, index=None, encoding='utf-8-sig')
|
| 717 |
log_files_output_paths.append(missing_df_out_path)
|
| 718 |
|
| 719 |
out_file_paths = list(set(out_file_paths))
|
windows_install_llama-cpp-python.txt
CHANGED
|
@@ -77,13 +77,15 @@ set PKG_CONFIG_PATH=C:\<path-to-openblas>\OpenBLAS\lib\pkgconfig # Set this in e
|
|
| 77 |
|
| 78 |
pip install llama-cpp-python==0.3.16 --force-reinstall --verbose --no-cache-dir -Ccmake.args="-DGGML_BLAS=ON;-DGGML_BLAS_VENDOR=OpenBLAS;-DBLAS_INCLUDE_DIRS=C:/<path-to-openblas>/OpenBLAS/include;-DBLAS_LIBRARIES=C:/<path-to-openblas>/OpenBLAS/lib/libopenblas.lib"
|
| 79 |
|
|
|
|
|
|
|
| 80 |
or to make a wheel:
|
| 81 |
|
| 82 |
pip install llama-cpp-python==0.3.16 --wheel-dir dist --verbose --no-cache-dir -Ccmake.args="-DGGML_BLAS=ON;-DGGML_BLAS_VENDOR=OpenBLAS;-DBLAS_INCLUDE_DIRS=C:/<path-to-openblas>/OpenBLAS/include;-DBLAS_LIBRARIES=C:/<path-to-openblas>/OpenBLAS/lib/libopenblas.lib"
|
| 83 |
|
| 84 |
-
pip wheel llama-cpp-python==0.3.16 --wheel-dir dist --verbose --no-cache-dir -Ccmake.args="-DGGML_BLAS=ON;-DGGML_BLAS_VENDOR=OpenBLAS;-DBLAS_INCLUDE_DIRS=C:/Users
|
|
|
|
| 85 |
|
| 86 |
-
C:/Users/spedrickcase/libs
|
| 87 |
|
| 88 |
## With Cuda (NVIDIA GPUs only)
|
| 89 |
|
|
|
|
| 77 |
|
| 78 |
pip install llama-cpp-python==0.3.16 --force-reinstall --verbose --no-cache-dir -Ccmake.args="-DGGML_BLAS=ON;-DGGML_BLAS_VENDOR=OpenBLAS;-DBLAS_INCLUDE_DIRS=C:/<path-to-openblas>/OpenBLAS/include;-DBLAS_LIBRARIES=C:/<path-to-openblas>/OpenBLAS/lib/libopenblas.lib"
|
| 79 |
|
| 80 |
+
pip install llama-cpp-python==0.3.16 --verbose --no-cache-dir -Ccmake.args="-DGGML_BLAS=ON;-DGGML_BLAS_VENDOR=OpenBLAS;-DBLAS_INCLUDE_DIRS=C:/Users/s_cas/libs/OpenBLAS/include;-DBLAS_LIBRARIES=C:/Users/s_cas/OpenBLAS/lib/libopenblas.lib;-DPKG_CONFIG_PATH=C:/users/s_cas/openblas/lib/pkgconfig"
|
| 81 |
+
|
| 82 |
or to make a wheel:
|
| 83 |
|
| 84 |
pip install llama-cpp-python==0.3.16 --wheel-dir dist --verbose --no-cache-dir -Ccmake.args="-DGGML_BLAS=ON;-DGGML_BLAS_VENDOR=OpenBLAS;-DBLAS_INCLUDE_DIRS=C:/<path-to-openblas>/OpenBLAS/include;-DBLAS_LIBRARIES=C:/<path-to-openblas>/OpenBLAS/lib/libopenblas.lib"
|
| 85 |
|
| 86 |
+
pip wheel llama-cpp-python==0.3.16 --wheel-dir dist --verbose --no-cache-dir -Ccmake.args="-DGGML_BLAS=ON;-DGGML_BLAS_VENDOR=OpenBLAS;-DBLAS_INCLUDE_DIRS=C:/Users/<user>/libs/OpenBLAS/include;-DBLAS_LIBRARIES=C:/Users/<user>/libs/OpenBLAS/lib/libopenblas.lib"
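Once the wheel has been built into dist/, it can be installed directly; the exact filename depends on your Python version and platform, so the one below is illustrative only:

pip install dist/llama_cpp_python-0.3.16-cp311-cp311-win_amd64.whl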
|
| 87 |
+
|
| 88 |
|
|
|
|
| 89 |
|
| 90 |
## With Cuda (NVIDIA GPUs only)
|
| 91 |
|