|
{ |
|
"model": { |
|
"bos_token_id": 128000, |
|
"context_length": 4096, |
|
"decoder": { |
|
"session_options": { |
|
"log_id": "onnxruntime-genai", |
|
"provider_options": [] |
|
}, |
|
"head_size": 128, |
|
"hidden_size": 3072, |
|
"inputs": { |
|
"input_ids": "input_ids", |
|
"attention_mask": "attention_mask_dummy", |
|
"position_ids": "position_ids_dummy", |
|
"past_key_names": "past_key_%d_in", |
|
"past_value_names": "past_value_%d_in" |
|
}, |
|
"outputs": { |
|
"logits": "logits_dequantized", |
|
"present_key_names": "past_key_%d_out", |
|
"present_value_names": "past_value_%d_out" |
|
}, |
|
"num_attention_heads": 24, |
|
"num_hidden_layers": 28, |
|
"num_key_value_heads": 8, |
|
"sliding_window_key_value_cache": { |
|
"window_size": 128, |
|
"pad_value": 128 |
|
}, |
|
"pipeline": [ |
|
{ |
|
"position-processor": { |
|
"filename": "position-processor.onnx", |
|
"inputs": [ |
|
"attention_mask_before_processor", |
|
"position_ids" |
|
], |
|
"outputs": [ |
|
"attention_mask_before_quantizer", |
|
"position_ids_cos_before_quantizer", |
|
"position_ids_sin_before_quantizer" |
|
], |
|
"session_options": { |
|
"log_id": "onnxruntime-genai.position_processor", |
|
"provider_options": [ |
|
{} |
|
] |
|
}, |
|
"run_on_token_gen": false |
|
}, |
|
"position-shifter": { |
|
"filename": "position-shifter.onnx", |
|
"inputs": [ |
|
"attention_mask_before_processor", |
|
"position_ids" |
|
], |
|
"outputs": [ |
|
"attention_mask_shifted", |
|
"position_ids_shifted", |
|
"attention_mask_before_quantizer", |
|
"position_ids_cos_before_quantizer", |
|
"position_ids_sin_before_quantizer" |
|
], |
|
"output_names_forwarder": { |
|
"attention_mask_shifted": "attention_mask_before_processor", |
|
"position_ids_shifted": "position_ids" |
|
}, |
|
"session_options": { |
|
"log_id": "onnxruntime-genai.position_shifter", |
|
"provider_options": [ |
|
{} |
|
] |
|
}, |
|
"run_on_prompt": false |
|
}, |
|
"quantizer": { |
|
"filename": "quantizer.onnx", |
|
"inputs": [ |
|
"attention_mask_before_quantizer", |
|
"position_ids_cos_before_quantizer", |
|
"position_ids_sin_before_quantizer" |
|
], |
|
"outputs": [ |
|
"attention_mask", |
|
"position_ids_cos", |
|
"position_ids_sin" |
|
], |
|
"session_options": { |
|
"log_id": "onnxruntime-genai.quantizer", |
|
"provider_options": [ |
|
{} |
|
] |
|
} |
|
}, |
|
"prompt-processor-1": { |
|
"filename": "prompt_1_of_3_qnn_ctx.onnx", |
|
"inputs": [ |
|
"input_ids" |
|
], |
|
"outputs": [ |
|
"_model_model_embed_tokens_Gather_output_0" |
|
], |
|
"session_options": { |
|
"log_id": "onnxruntime-genai.pp1", |
|
"provider_options": [ |
|
{ |
|
"qnn": { |
|
"backend_path": "libQnnHtp.so", |
|
"htp_performance_mode": "burst", |
|
"enable_htp_shared_memory_allocator": "1", |
|
"qnn_context_priority": "high" |
|
} |
|
} |
|
] |
|
}, |
|
"run_on_token_gen": false |
|
}, |
|
"prompt-processor-2": { |
|
"filename": "prompt_2_of_3_qnn_ctx.onnx", |
|
"inputs": [ |
|
"_model_model_embed_tokens_Gather_output_0", |
|
"attention_mask", |
|
"position_ids_cos", |
|
"position_ids_sin", |
|
"past_key_0_in", |
|
"past_value_0_in", |
|
"past_key_1_in", |
|
"past_value_1_in", |
|
"past_key_2_in", |
|
"past_value_2_in", |
|
"past_key_3_in", |
|
"past_value_3_in", |
|
"past_key_4_in", |
|
"past_value_4_in", |
|
"past_key_5_in", |
|
"past_value_5_in", |
|
"past_key_6_in", |
|
"past_value_6_in", |
|
"past_key_7_in", |
|
"past_value_7_in", |
|
"past_key_8_in", |
|
"past_value_8_in", |
|
"past_key_9_in", |
|
"past_value_9_in", |
|
"past_key_10_in", |
|
"past_value_10_in", |
|
"past_key_11_in", |
|
"past_value_11_in", |
|
"past_key_12_in", |
|
"past_value_12_in", |
|
"past_key_13_in", |
|
"past_value_13_in" |
|
], |
|
"outputs": [ |
|
"_model_model_layers_13_Add_1_output_0", |
|
"past_key_0_out", |
|
"past_value_0_out", |
|
"past_key_1_out", |
|
"past_value_1_out", |
|
"past_key_2_out", |
|
"past_value_2_out", |
|
"past_key_3_out", |
|
"past_value_3_out", |
|
"past_key_4_out", |
|
"past_value_4_out", |
|
"past_key_5_out", |
|
"past_value_5_out", |
|
"past_key_6_out", |
|
"past_value_6_out", |
|
"past_key_7_out", |
|
"past_value_7_out", |
|
"past_key_8_out", |
|
"past_value_8_out", |
|
"past_key_9_out", |
|
"past_value_9_out", |
|
"past_key_10_out", |
|
"past_value_10_out", |
|
"past_key_11_out", |
|
"past_value_11_out", |
|
"past_key_12_out", |
|
"past_value_12_out", |
|
"past_key_13_out", |
|
"past_value_13_out" |
|
], |
|
"session_options": { |
|
"log_id": "onnxruntime-genai.pp2", |
|
"provider_options": [ |
|
{ |
|
"qnn": { |
|
"backend_path": "libQnnHtp.so", |
|
"htp_performance_mode": "burst", |
|
"enable_htp_shared_memory_allocator": "1", |
|
"qnn_context_priority": "high" |
|
} |
|
} |
|
] |
|
}, |
|
"run_on_token_gen": false |
|
}, |
|
"prompt-processor-3": { |
|
"filename": "prompt_3_of_3_qnn_ctx.onnx", |
|
"inputs": [ |
|
"_model_model_layers_13_Add_1_output_0", |
|
"attention_mask", |
|
"position_ids_cos", |
|
"position_ids_sin", |
|
"past_key_14_in", |
|
"past_value_14_in", |
|
"past_key_15_in", |
|
"past_value_15_in", |
|
"past_key_16_in", |
|
"past_value_16_in", |
|
"past_key_17_in", |
|
"past_value_17_in", |
|
"past_key_18_in", |
|
"past_value_18_in", |
|
"past_key_19_in", |
|
"past_value_19_in", |
|
"past_key_20_in", |
|
"past_value_20_in", |
|
"past_key_21_in", |
|
"past_value_21_in", |
|
"past_key_22_in", |
|
"past_value_22_in", |
|
"past_key_23_in", |
|
"past_value_23_in", |
|
"past_key_24_in", |
|
"past_value_24_in", |
|
"past_key_25_in", |
|
"past_value_25_in", |
|
"past_key_26_in", |
|
"past_value_26_in", |
|
"past_key_27_in", |
|
"past_value_27_in" |
|
], |
|
"outputs": [ |
|
"logits", |
|
"past_key_14_out", |
|
"past_value_14_out", |
|
"past_key_15_out", |
|
"past_value_15_out", |
|
"past_key_16_out", |
|
"past_value_16_out", |
|
"past_key_17_out", |
|
"past_value_17_out", |
|
"past_key_18_out", |
|
"past_value_18_out", |
|
"past_key_19_out", |
|
"past_value_19_out", |
|
"past_key_20_out", |
|
"past_value_20_out", |
|
"past_key_21_out", |
|
"past_value_21_out", |
|
"past_key_22_out", |
|
"past_value_22_out", |
|
"past_key_23_out", |
|
"past_value_23_out", |
|
"past_key_24_out", |
|
"past_value_24_out", |
|
"past_key_25_out", |
|
"past_value_25_out", |
|
"past_key_26_out", |
|
"past_value_26_out", |
|
"past_key_27_out", |
|
"past_value_27_out" |
|
], |
|
"session_options": { |
|
"log_id": "onnxruntime-genai.pp3", |
|
"provider_options": [ |
|
{ |
|
"qnn": { |
|
"backend_path": "libQnnHtp.so", |
|
"htp_performance_mode": "burst", |
|
"enable_htp_shared_memory_allocator": "1", |
|
"qnn_context_priority": "high" |
|
} |
|
} |
|
] |
|
}, |
|
"run_on_token_gen": false |
|
}, |
|
"token-generator-1": { |
|
"filename": "token_1_of_3_qnn_ctx.onnx", |
|
"inputs": [ |
|
"input_ids" |
|
], |
|
"outputs": [ |
|
"_model_model_embed_tokens_Gather_output_0" |
|
], |
|
"session_options": { |
|
"log_id": "onnxruntime-genai.tg1", |
|
"provider_options": [ |
|
{ |
|
"qnn": { |
|
"backend_path": "libQnnHtp.so", |
|
"htp_performance_mode": "burst", |
|
"enable_htp_shared_memory_allocator": "1", |
|
"qnn_context_priority": "high" |
|
} |
|
} |
|
] |
|
}, |
|
"run_on_prompt": false |
|
}, |
|
"token-generator-2": { |
|
"filename": "token_2_of_3_qnn_ctx.onnx", |
|
"inputs": [ |
|
"_model_model_embed_tokens_Gather_output_0", |
|
"attention_mask", |
|
"position_ids_cos", |
|
"position_ids_sin", |
|
"past_key_0_in", |
|
"past_value_0_in", |
|
"past_key_1_in", |
|
"past_value_1_in", |
|
"past_key_2_in", |
|
"past_value_2_in", |
|
"past_key_3_in", |
|
"past_value_3_in", |
|
"past_key_4_in", |
|
"past_value_4_in", |
|
"past_key_5_in", |
|
"past_value_5_in", |
|
"past_key_6_in", |
|
"past_value_6_in", |
|
"past_key_7_in", |
|
"past_value_7_in", |
|
"past_key_8_in", |
|
"past_value_8_in", |
|
"past_key_9_in", |
|
"past_value_9_in", |
|
"past_key_10_in", |
|
"past_value_10_in", |
|
"past_key_11_in", |
|
"past_value_11_in", |
|
"past_key_12_in", |
|
"past_value_12_in", |
|
"past_key_13_in", |
|
"past_value_13_in" |
|
], |
|
"outputs": [ |
|
"_model_model_layers_13_Add_1_output_0", |
|
"past_key_0_out", |
|
"past_value_0_out", |
|
"past_key_1_out", |
|
"past_value_1_out", |
|
"past_key_2_out", |
|
"past_value_2_out", |
|
"past_key_3_out", |
|
"past_value_3_out", |
|
"past_key_4_out", |
|
"past_value_4_out", |
|
"past_key_5_out", |
|
"past_value_5_out", |
|
"past_key_6_out", |
|
"past_value_6_out", |
|
"past_key_7_out", |
|
"past_value_7_out", |
|
"past_key_8_out", |
|
"past_value_8_out", |
|
"past_key_9_out", |
|
"past_value_9_out", |
|
"past_key_10_out", |
|
"past_value_10_out", |
|
"past_key_11_out", |
|
"past_value_11_out", |
|
"past_key_12_out", |
|
"past_value_12_out", |
|
"past_key_13_out", |
|
"past_value_13_out" |
|
], |
|
"session_options": { |
|
"log_id": "onnxruntime-genai.tg2", |
|
"provider_options": [ |
|
{ |
|
"qnn": { |
|
"backend_path": "libQnnHtp.so", |
|
"htp_performance_mode": "burst", |
|
"enable_htp_shared_memory_allocator": "1", |
|
"qnn_context_priority": "high" |
|
} |
|
} |
|
] |
|
}, |
|
"run_on_prompt": false |
|
}, |
|
"token-generator-3": { |
|
"filename": "token_3_of_3_qnn_ctx.onnx", |
|
"inputs": [ |
|
"_model_model_layers_13_Add_1_output_0", |
|
"attention_mask", |
|
"position_ids_cos", |
|
"position_ids_sin", |
|
"past_key_14_in", |
|
"past_value_14_in", |
|
"past_key_15_in", |
|
"past_value_15_in", |
|
"past_key_16_in", |
|
"past_value_16_in", |
|
"past_key_17_in", |
|
"past_value_17_in", |
|
"past_key_18_in", |
|
"past_value_18_in", |
|
"past_key_19_in", |
|
"past_value_19_in", |
|
"past_key_20_in", |
|
"past_value_20_in", |
|
"past_key_21_in", |
|
"past_value_21_in", |
|
"past_key_22_in", |
|
"past_value_22_in", |
|
"past_key_23_in", |
|
"past_value_23_in", |
|
"past_key_24_in", |
|
"past_value_24_in", |
|
"past_key_25_in", |
|
"past_value_25_in", |
|
"past_key_26_in", |
|
"past_value_26_in", |
|
"past_key_27_in", |
|
"past_value_27_in" |
|
], |
|
"outputs": [ |
|
"logits", |
|
"past_key_14_out", |
|
"past_value_14_out", |
|
"past_key_15_out", |
|
"past_value_15_out", |
|
"past_key_16_out", |
|
"past_value_16_out", |
|
"past_key_17_out", |
|
"past_value_17_out", |
|
"past_key_18_out", |
|
"past_value_18_out", |
|
"past_key_19_out", |
|
"past_value_19_out", |
|
"past_key_20_out", |
|
"past_value_20_out", |
|
"past_key_21_out", |
|
"past_value_21_out", |
|
"past_key_22_out", |
|
"past_value_22_out", |
|
"past_key_23_out", |
|
"past_value_23_out", |
|
"past_key_24_out", |
|
"past_value_24_out", |
|
"past_key_25_out", |
|
"past_value_25_out", |
|
"past_key_26_out", |
|
"past_value_26_out", |
|
"past_key_27_out", |
|
"past_value_27_out" |
|
], |
|
"session_options": { |
|
"log_id": "onnxruntime-genai.tg3", |
|
"provider_options": [ |
|
{ |
|
"qnn": { |
|
"backend_path": "libQnnHtp.so", |
|
"htp_performance_mode": "burst", |
|
"enable_htp_shared_memory_allocator": "1", |
|
"qnn_context_priority": "high" |
|
} |
|
} |
|
] |
|
}, |
|
"run_on_prompt": false |
|
}, |
|
"dequantizer": { |
|
"filename": "dequantizer.onnx", |
|
"inputs": [ |
|
"logits" |
|
], |
|
"outputs": [ |
|
"logits_dequantized" |
|
], |
|
"session_options": { |
|
"log_id": "onnxruntime-genai.dequantizer", |
|
"provider_options": [ |
|
{} |
|
] |
|
} |
|
} |
|
} |
|
] |
|
}, |
|
"eos_token_id": [ |
|
128001, |
|
128008, |
|
128009 |
|
], |
|
"pad_token_id": 128001, |
|
"type": "decoder-pipeline", |
|
"vocab_size": 128256 |
|
}, |
|
"search": { |
|
"diversity_penalty": 0.0, |
|
"do_sample": true, |
|
"early_stopping": true, |
|
"length_penalty": 1.0, |
|
"max_length": 4096, |
|
"min_length": 0, |
|
"no_repeat_ngram_size": 0, |
|
"num_beams": 1, |
|
"num_return_sequences": 1, |
|
"past_present_share_buffer": true, |
|
"repetition_penalty": 1.0, |
|
"temperature": 0.6, |
|
"top_k": 1, |
|
"top_p": 0.9 |
|
} |
|
} |
|
|